# Train a Language Model to Detect Human Values in Arguments

In [None]:
!pip install transformers torchmetrics

In [30]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn

from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import pickle


RANDOM_SEED = 99
torch.manual_seed(99)

<torch._C.Generator at 0x7fb1e8473830>

In [31]:
from model.BertDataModule import BertDataModule, BertDataset
from model.BertFineTuner import BertFineTuner, train

### Define Parameters

In [32]:
PARAMS = {
    # Language Model and Hyperparameters
    "MODEL_PATH": 'roberta-base',
    "BATCH_SIZE": 32,
    "ACCUMULATE_GRAD_BATCHES": 1,
    "LR": 1e-5,
    "EPOCHS": 3,
    "OPTIMIZER": 'AdamW',
    "DEVICE": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    "NUM_TRAIN_WORKERS": 4,
    "NUM_VAL_WORKERS": 4,
    "MAX_TOKEN_COUNT":128,
    "RANDOM_SEED": RANDOM_SEED,

    # Early Stopping Params
    "PATIENCE": 3,
    "VAL_CHECK_INTERVAL": 300,
    
    # The metric we optimize for. Alternative "custom_f1/Val" and "max"
    "MAX_THRESHOLD_METRIC": "custom", #The f1-score that should maximized (custom = formula for the task evaluation)
    "EARLY_STOPPING_METRIC": "avg_val_loss",
    "EARLY_STOPPING_MODE": "min",

    # DATA
    "VALIDATION_SET_SIZE":500,

    "TRAIN_PATH" : "./data/data_training_full.csv", #
    "LEAVE_OUT_DATA_PATH": "./data/leave_out_dataset_300.csv",
    "SAVE_PATH": "./model/best_model.pt"

}


## Data Loading


In [33]:
train_df = pd.read_csv(PARAMS["TRAIN_PATH"], index_col=0)
LABEL_COLUMNS = train_df.columns.tolist()[6:]

leave_out_df = pd.read_csv(PARAMS["LEAVE_OUT_DATA_PATH"], index_col=0)

## Model Training

### Linear Learning Rate Schedule

In [34]:
steps_per_epoch=len(train_df) // PARAMS['BATCH_SIZE']
total_training_steps = steps_per_epoch * PARAMS['EPOCHS']
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(171, 856)

### Prepare Data Modules for the Training

In [35]:
train_df, val_df = train_test_split(train_df, test_size=PARAMS["VALIDATION_SET_SIZE"], random_state=PARAMS["RANDOM_SEED"])
TOKENIZER = AutoTokenizer.from_pretrained(PARAMS["MODEL_PATH"])

data_module = BertDataModule(
    train_df=train_df,
    val_df=val_df,
    tokenizer=TOKENIZER,
    params=PARAMS,
    label_columns=LABEL_COLUMNS
)

data_module.setup()
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()

In [7]:
print(LABEL_COLUMNS)
print(len(LABEL_COLUMNS))

['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']
20


In [8]:
# Print some examples from the train loader
print("Train Loader Examples:")
for batch in train_loader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    # Print the input examples
    for i in range(len(input_ids)):
        print("Example", i+1)
        print("Input IDs:", input_ids[i])
        print("Attention Mask:", attention_mask[i])
        print("Labels:", labels[i])
        print()

    # Stop after printing a few examples
    break

# Print some examples from the val loader
print("Val Loader Examples:")
for batch in val_loader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    # Print the input examples
    for i in range(len(input_ids)):
        print("Example", i+1)
        print("Input IDs:", input_ids[i])
        print("Attention Mask:", attention_mask[i])
        print("Labels:", labels[i])
        print()

    # Stop after 

Train Loader Examples:


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Example 1
Input IDs: tensor([    0,   170,    32, 19836,   131,    42,    16,    45,  8222,    53,
           52,   218,    75,   323,   434,     4,   318,    52,  3264,   171,
         4175,   172,    52,   146,    10,  1307,   912,    11,     5,   434,
            9,     5, 10822,   749,     6,   142,    38,   524,   727,   207,
          686,    14,    51,    40,   393,   655,   213,   124,     7,    49,
          247,     4,    11,  4402,     9,   166,   109,    45,   240,  2447,
           31,   786,    12, 17108,    50,   786,    12,  9502,    12,  8331,
          749,     4,  1437,     2,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1, 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Example 1
Input IDs: tensor([    0, 25539,  8653,    16,    10, 38025,  1760,    14, 24130,     7,
           82,    14,  1900,    16,    10, 10676,  1973,     8, 20993,  3786,
           11,   488,     4,    11,  4402,     9,   166,   197, 31165,   812,
         8653,     2,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1, 

### Calculate weights to address imbalance problem

In [36]:
from sklearn.utils import class_weight

def calculate_class_weights(no_of_classes, samples_per_cls, power=1):
    weights_for_samples = 1.0/np.array(np.power(samples_per_cls,power))
    weights_for_samples = weights_for_samples / np.sum(weights_for_samples) * no_of_classes
    return weights_for_samples

temp_df = pd.concat([train_df, val_df], ignore_index=True)
class_labels = temp_df[LABEL_COLUMNS]
num_ones = class_labels.eq(1).sum()
class_weights = calculate_class_weights(20,num_ones)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print(class_weights)

[0.60873996 0.4023066  1.9535921  2.66267368 0.36530584 0.99989137
 0.98753007 1.51034011 0.27439767 0.34780933 1.04040795 0.47705501
 2.80828865 1.44072524 0.38486183 0.72253457 0.275449   1.34127219
 0.85382648 0.54299237]


### Create Model

In [37]:
model = BertFineTuner(params=PARAMS, label_columns=LABEL_COLUMNS, n_training_steps=total_training_steps, n_warmup_steps=warmup_steps)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train(
    model = model,
    train_loader = train_loader, 
    val_loader = val_loader, 
    num_epochs = PARAMS['EPOCHS'], 
    learning_rate = PARAMS['LR'], 
    n_warmup_steps = warmup_steps, 
    n_training_steps = total_training_steps,
    save_path = PARAMS['SAVE_PATH'],
    class_weights= class_weights_tensor.to(PARAMS['DEVICE'])
)

Epoch 1/20:   0%|          | 0/399 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

KeyboardInterrupt: 

Now we are done with the training. This process is repeated with several different configurations for the model. More information can be found in the system description paper.

# Evaluation

The predictions for the final submissions are done based on an ensemble.
Hence for ensembling, please continue with the ensemble_eval_and_predict.ipynb notebook.
However, for simplicity or if you are interested, you may want to continue here to evaluate the model performance.

1. We determine the decision threshold to decide when a certain label should be counted as 1, based on the val_data
2. We predict the test_data with it (if splitted above)

We load the model from the best_checkpoint in order to get the model that performed best with respect to the early stopping metric.

In [40]:
trained_weights = torch.load(PARAMS['SAVE_PATH'])
model.load_state_dict(trained_weights)

model.eval()
for param in model.parameters():
    param.requires_grad = False

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = model.to(device)

predictions = []
labels = []

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_tensor = batch["labels"].to(device)

        prediction = trained_model(input_ids, attention_mask)

        predictions.append(prediction)
        labels.append(labels_tensor)

predictions = torch.cat(predictions, dim=0).cpu()
labels = torch.cat(labels, dim=0).cpu()

  0%|          | 0/16 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

KeyboardInterrupt: 

Select optimal Threshold on Val Dataset

In [67]:
predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

In [72]:
from toolbox.bert_utils import calculate_best_threshold

THRESHOLD, best_f1_score = calculate_best_threshold(labels.numpy(), predictions.numpy())

binarize the predictions with the optimal threshold

In [None]:
y_pred = predictions.numpy()
y_true = labels.numpy()

upper, lower = 1, 0

y_pred = np.where(y_pred > THRESHOLD, upper, lower)

In [None]:
print(f"Threshold: {THRESHOLD}")
print(classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
))

class_rep = classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
    output_dict=True
)

# Use Threshold to predict on Test Data
If we want to predict on the test-data (if you have split it apart, alternatively you could use the leave-out-dataset). For a single Model.

In [22]:
test_df = leave_out_df

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

test_dataset = BertDataset(
    test_df,
    tokenizer=TOKENIZER,
    max_token_count=PARAMS["MAX_TOKEN_COUNT"],
    label_columns=LABEL_COLUMNS
)

predictions = []
labels = []

with torch.no_grad():
    for item in tqdm(test_dataset):
        input_ids = item["input_ids"].unsqueeze(dim=0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(dim=0).to(device)

        prediction = trained_model(input_ids, attention_mask)

        predictions.append(prediction.flatten().cpu())
        labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

y_pred = predictions.numpy()
y_true = labels.numpy()

###  Binarize the model predictions with Threshold


In [24]:
upper, lower = 1, 0

y_pred = np.where(y_pred > THRESHOLD, upper, lower)

In [25]:
print(f"Threshold: {THRESHOLD}")
print(classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
))

class_rep = classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
    output_dict=True
)

Threshold: 0.25
                            precision    recall  f1-score   support

   Self-direction: thought       0.41      0.92      0.57        49
    Self-direction: action       0.69      0.89      0.78        75
               Stimulation       0.19      0.64      0.29        14
                  Hedonism       0.20      1.00      0.33         5
               Achievement       0.74      0.92      0.82        86
          Power: dominance       0.39      0.91      0.55        33
          Power: resources       0.50      1.00      0.67        26
                      Face       0.24      0.56      0.34        25
        Security: personal       0.71      0.95      0.81       103
        Security: societal       0.69      0.94      0.80        85
                 Tradition       0.54      1.00      0.70        31
         Conformity: rules       0.75      0.91      0.82        81
 Conformity: interpersonal       0.33      0.82      0.47        11
                  Humility     

# Calculate F1-Score

In [26]:
test_custom_f1 = -1
test_macro_recall = class_rep["macro avg"]["recall"]
test_macro_precision = class_rep["macro avg"]["precision"]
if (test_macro_precision + test_macro_recall) != 0:
    test_custom_f1 = (2*test_macro_recall*test_macro_precision/(test_macro_recall+test_macro_precision))
else:
    test_custom_f1 = 0
print(test_custom_f1)

0.6105788216966658


## Test model

In [29]:
test_df_input = pd.read_csv('./data/arguments-test.tsv', sep='\t')
test_df_input["text"] = test_df_input["Premise"]+" " + test_df_input["Stance"]+ " " + test_df_input["Conclusion"]
test_df_input.head()

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

test_df_dataset = BertDataset(
    data=test_df_input,
    tokenizer=TOKENIZER,
    max_token_count=PARAMS["MAX_TOKEN_COUNT"],
)

predictions = []

for item in tqdm(test_df_dataset):
    _, prediction = trained_model(
        item["input_ids"].unsqueeze(dim=0).to(device),
        item["attention_mask"].unsqueeze(dim=0).to(device)
    )
    predictions.append(prediction.flatten())

predictions = torch.stack(predictions).detach().cpu()

In [26]:
y_pred = predictions.numpy()
upper, lower = 1, 0
y_pred = np.where(y_pred > THRESHOLD, upper, lower)

In [27]:
prediction_dictionary = {}
prediction_dictionary["Argument ID"] = test_df_input["Argument ID"]
for idx, l_name in enumerate(LABEL_COLUMNS):
  prediction_dictionary[l_name]=y_pred[:,idx]

test_prediction_df = pd.DataFrame(prediction_dictionary)
test_prediction_df.head()

Unnamed: 0,Argument ID,Self-direction: thought,Self-direction: action,Stimulation,Hedonism,Achievement,Power: dominance,Power: resources,Face,Security: personal,...,Tradition,Conformity: rules,Conformity: interpersonal,Humility,Benevolence: caring,Benevolence: dependability,Universalism: concern,Universalism: nature,Universalism: tolerance,Universalism: objectivity
0,A26004,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,A26010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,A26016,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,1,0,0,1
3,A26024,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,A26026,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [None]:
if RUN_ID:
    test_prediction_df.to_csv(f"submissions/{RUN_ID}-submission_test.txt", sep="\t", index=False)
else:
    test_prediction_df.to_csv(f"submissions/{NAME}-submission_test.txt", sep="\t", index=False)

# Looking at single predictions

In [35]:
def print_example_prediction(record, show_all_probs=False, THRESHOLD=0.3):

    print(record["Argument ID"])
    print(record["text"])
    print(f"True Label: {record.category}")


    encoding = TOKENIZER.encode_plus(
        record.text,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
    )

    _, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
    test_prediction = test_prediction.flatten().numpy()

    res = {}
    if show_all_probs:
        for label, prediction in zip(LABEL_COLUMNS, test_prediction):
            print(f"{label}: {prediction}")
            res[label] = prediction

    else:
        print(f"Predictions:")
        for label, prediction in zip(LABEL_COLUMNS, test_prediction):
            if prediction < THRESHOLD:
                continue
            print(f"{label}: {prediction}")
            res[label] = prediction
    return res

In [36]:
# 13 whaling is good one
trained_model.to("cpu")
test_record = test_df.iloc[6]
print_example_prediction(test_record, show_all_probs=False, THRESHOLD=THRESHOLD)


A18309
social media gives it users a place to seek support when in need whether emotional or financially, things that would be more difficult if not impossible to do outside of their home. against Social media brings more harm than good
True Label: ['Self-direction: action', 'Face', 'Security: personal', 'Benevolence: caring', 'Benevolence: dependability']
Predictions:
Self-direction: action: 0.49473991990089417
Stimulation: 0.40371981263160706
Hedonism: 0.4516661763191223
Security: personal: 0.9821780323982239
Benevolence: caring: 0.9349980354309082
Universalism: tolerance: 0.327671617269516


{'Self-direction: action': 0.49473992,
 'Stimulation': 0.4037198,
 'Hedonism': 0.45166618,
 'Security: personal': 0.98217803,
 'Benevolence: caring': 0.93499804,
 'Universalism: tolerance': 0.32767162}