In [1]:
import random
import torch
import numpy as np
import os

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(13370)

In [2]:
!head stage1_training_data.csv

,sentence1,sentence2,gold_label,sent1_readable,sent2_readable,sent1_subj_quant,sent1_subj_adj,sent1_obj_quant,sent1_obj_adj,sent1_adv,sent1_polarity,sent2_subj_quant,sent2_subj_adj,sent2_obj_quant,sent2_obj_adj,sent2_adv,sent2_polarity
8,some slow gymnast emptystring emptystring publicizes notevery emptystring straw,some emptystring gymnast emptystring jealously publicizes every opaque straw,neutral,some slow gymnast publicizes not every straw,some gymnast jealously publicizes every opaque straw,some,yes,notevery,no,no,aff,some,no,every,yes,yes,aff
12,no Spanish receiver emptystring emptystring publicizes some dry lemur,every emptystring receiver emptystring fortunately publicizes every dry lemur,contradiction,no spanish receiver publicizes some dry lemur,every receiver fortunately publicizes every dry lemur,no,yes,some,yes,no,aff,every,no,every,yes,yes,aff
13,no furry linguist emptystring deftly draws no pink lemur,every furry linguist emptystring deftly draws every emptystring lem

In [None]:
# 7 sent1_subj_adj
# 9 sent1_obj_adj
# 10 sent1_adv,sent1_polarity,sent2_subj_quant,sent2_subj_adj,sent2_obj_quant,sent2_obj_adj,sent2_adv,sent2_polarity

sadj1 = set()
subj1 = set()
adv1 = set()
verb1 = set()
oadj1 = set()
obj1 = set()

sadj2 = set()
subj2 = set()
adv2 = set()
verb2 = set()
oadj2 = set()
obj2 = set()

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        sadj1.add(words[1])
        subj1.add(words[2])
        adv1.add(words[4])
        verb1.add(words[5])
        oadj1.add(words[7])
        obj1.add(words[8])
        words = chunks[2].split()
        sadj2.add(words[1])
        subj2.add(words[2])
        adv2.add(words[4])
        verb2.add(words[5])
        oadj2.add(words[7])
        obj2.add(words[8])

set_random_seed(0xABBA+1)

sadjs = list(sadj1-set(('emptystring',)))
np.random.shuffle(sadjs)

subjs = list(subj1-set(('emptystring',)))
np.random.shuffle(subjs)

objs = list(obj1-set(('emptystring',)))
np.random.shuffle(objs)

advs = list(adv1-set(('emptystring',)))
np.random.shuffle(advs)

oadjs = list(oadj1-set(('emptystring',)))
np.random.shuffle(oadjs)

verbs = list(verb1-set(('emptystring',)))
np.random.shuffle(verbs)

TRAIN_PART = 60
VALID_PART = 1000

train_sadjs = sadjs[:TRAIN_PART]+['emptystring',]
train_subjs = subjs[:TRAIN_PART]+['emptystring',]
train_objs = objs[:TRAIN_PART]+['emptystring',]
train_advs = advs[:TRAIN_PART]+['emptystring',]
train_oadjs = oadjs[:TRAIN_PART]+['emptystring',]
train_verbs = verbs[:TRAIN_PART]+['emptystring',]

valid_sadjs = sadjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_subjs = subjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_objs =  objs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_advs =  advs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_oadjs = oadjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_verbs = verbs[TRAIN_PART:VALID_PART]+['emptystring',]


In [62]:
print(train_sadjs)
print(valid_sadjs)

['beautiful', 'terrible', 'scaly', 'savvy', 'preposterous', 'slow', 'charismatic', 'Mongolian', 'untrustworthy', 'friendly', 'lucky', 'fuzzy', 'furry', 'antagonistic', 'Israeli', 'Italian', 'Texan', 'Korean', 'Latvian', 'stupid', 'unhelpful', 'rough', 'Panamanian', 'Mexican', 'Nebraskan', 'uncouth', 'Chinese', 'kooky', 'wild', 'sheltered', 'sneaky', 'coy', 'small', 'irresistible', 'loving', 'French', 'sly', 'Indian', 'Austrian', 'jealous', 'Spanish', 'insane', 'vivacious', 'happy', 'idealistic', 'Moroccan', 'rude', 'angry', 'Ukranian', 'devout', 'Pakistani', 'Canadian', 'big', 'polite', 'chummy', 'soft', 'surly', 'scatterbrained', 'taciturn', 'seductive', 'emptystring']
['outstanding', 'unpredictable', 'boisterous', 'thoughtful', 'Oklahoman', 'Alabaman', 'religious', 'helpful', 'talkative', 'smooth', 'Afghani', 'noble', 'proud', 'silly', 'Columbian', 'Siberian', 'crazy', 'Hawaiian', 'Oregonian', 'English', 'Indonesian', 'Alaskan', 'Mississippian', 'Californian', 'unhinged', 'horrifying

In [63]:
records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(records)
print(len(records))

475998


In [64]:
train_records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        if  words[1] not in train_sadjs or \
            words[2] not in train_subjs or \
            words[4] not in train_advs or \
            words[5] not in train_verbs or \
            words[7] not in train_oadjs or \
            words[8] not in train_objs: continue
        words = chunks[2].split()
        if  words[1] not in train_sadjs or \
            words[2] not in train_subjs or \
            words[4] not in train_advs or \
            words[5] not in train_verbs or \
            words[7] not in train_oadjs or \
            words[8] not in train_objs: continue
        train_records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(train_records)
print(len(train_records))

19806


In [67]:
val_records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        if  words[1] not in valid_sadjs or \
            words[2] not in valid_subjs or \
            words[8] not in valid_objs: continue
        words = chunks[2].split()
        if  words[1] not in valid_sadjs or \
            words[2] not in valid_subjs or \
            words[4] not in valid_advs or \
            words[5] not in valid_verbs or \
            words[7] not in valid_oadjs or \
            words[8] not in valid_objs: continue
        val_records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(val_records)
print(len(val_records))
# val_records

3503


In [68]:
test_records = val_records[:len(val_records)//2]
val_records = val_records[-len(val_records)//2:]
print(len(test_records), len(val_records))

1751 1752


In [4]:
VAL_SHARE = .1
TEST_SHARE = .1

val_records = records[:int(len(records)*VAL_SHARE)]
test_records = records[-int(len(records)*TEST_SHARE):]
train_records = records[int(len(records)*VAL_SHARE):-int(len(records)*TEST_SHARE)]

In [69]:
from collections import Counter

labels_counter = Counter([r["label"] for r in records])
print(labels_counter, sum(labels_counter.values()))
labels = list(labels_counter.keys())
print(labels)

Counter({'contradiction': 319508, 'neutral': 317800, 'entailment': 314688}) 951996
['neutral', 'entailment', 'contradiction']


In [70]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NLIsDataset(Dataset):
    def __init__(self, records, tokenizer, max_tokens, labels):
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.records = records
        self.labels = labels
    
    def __len__(self):
        return len(self.records)
    
    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["left"],
            text_pair=record["right"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        label = record.get("label", None)
        if label is not None and label in self.labels:
            output["labels"] = torch.tensor(self.labels.index(label))
        return output

In [74]:

MODEL_NAME = 'bert-base-multilingual-uncased'
TOKENIZER_NAME = MODEL_NAME
MAX_TOKENS = 100
EPOCHS = 4*32 # 4
EVAL_STEPS = 32*4 # 32
WARMUP_STEPS = 16
LR = 0.00002
BATCH_SIZE = 32*4 # 32
GRAD_ACCUM_STEPS = 1 # 4
PATIENCE = 3

In [75]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, do_lower_case=False)
train_data = NLIsDataset(train_records, tokenizer, MAX_TOKENS, labels)
val_data = NLIsDataset(val_records, tokenizer, MAX_TOKENS, labels)

loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

In [76]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
model = model.to("cuda")

callbacks = [EarlyStoppingCallback(early_stopping_patience=PATIENCE)]

training_args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    callbacks=callbacks
)

!rm -rf checkpoints
trainer.train()

loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19806
  Num Epochs = 128
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 19840


Step,Training Loss,Validation Loss
128,0.8774,0.671406
256,0.6507,0.649886
384,0.5897,0.52726
512,0.5259,0.523474
640,0.5276,0.531081
768,0.5222,0.519647
896,0.5234,0.518913
1024,0.5148,0.523163
1152,0.5079,0.522633
1280,0.5054,0.547371


***** Running Evaluation *****
  Num examples = 1752
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-128
Configuration saved in checkpoints/checkpoint-128/config.json
Model weights saved in checkpoints/checkpoint-128/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1752
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-256
Configuration saved in checkpoints/checkpoint-256/config.json
Model weights saved in checkpoints/checkpoint-256/pytorch_model.bin
Deleting older checkpoint [checkpoints/checkpoint-128] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1752
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-384
Configuration saved in checkpoints/checkpoint-384/config.json
Model weights saved in checkpoints/checkpoint-384/pytorch_model.bin
Deleting older checkpoint [checkpoints/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1752
  Batch siz

TrainOutput(global_step=1280, training_loss=0.5745137512683869, metrics={'train_runtime': 1033.3974, 'train_samples_per_second': 2453.236, 'train_steps_per_second': 19.199, 'total_flos': 1.64247189621984e+16, 'train_loss': 0.5745137512683869, 'epoch': 8.26})

In [77]:
from tqdm.notebook import tqdm

def get_batch(data, batch_size):
    start_index = 0
    while start_index < len(data):
        end_index = start_index + batch_size
        batch = data[start_index:end_index]
        yield batch
        start_index = end_index

def pipe_predict(data, pipe, batch_size=64):
    raw_preds = []
    for batch in tqdm(get_batch(data, batch_size)):
        raw_preds += pipe(batch)
    preds = np.array([int(max(labels, key=lambda x: x["score"])["label"][-1]) for labels in raw_preds])
    pp = np.array([[l["score"] for l in labels] for labels in raw_preds])
    return preds, pp

In [78]:
from transformers import pipeline
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

model.eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

y_true = np.array([labels.index(r["label"]) for r in test_records], dtype=np.int32)
test_pairs = [(r["left"], r["right"]) for r in test_records]

y_pred, y_pred_prob = pipe_predict(test_pairs, pipe)

print(classification_report(y_true, y_pred, digits=3))
print(confusion_matrix(y_true, y_pred))


0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.992     0.761     0.861       494
           1      0.591     0.932     0.723       633
           2      0.759     0.455     0.569       624

    accuracy                          0.714      1751
   macro avg      0.781     0.716     0.718      1751
weighted avg      0.764     0.714     0.707      1751

[[376  69  49]
 [  2 590  41]
 [  1 339 284]]


In [17]:
### load model
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback


2021-11-09 14:52:02.310903: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [80]:
records2 = []

for idx, line in enumerate(open('stage2_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        records2.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(records2)
# records2[:5]
VAL_SHARE = .1
TEST_SHARE = .1

val_records2 = records2[:int(len(records2)*VAL_SHARE)]
test_records2 = records2[-int(len(records2)*TEST_SHARE):]
train_records2 = records2[int(len(records2)*VAL_SHARE):-int(len(records2)*TEST_SHARE)]

# tokenizer.convert_ids_to_tokens(val_data2[0]['input_ids'])[:10]

In [81]:
MAX_TOKENS = 100
EPOCHS = 4 # 4
EVAL_STEPS = 32 # 32
WARMUP_STEPS = 16
LR = 0.00002
BATCH_SIZE = 32*4 # 32
GRAD_ACCUM_STEPS = 1 # 4
PATIENCE = 3

In [84]:
for run in range(40):
    print('===',run)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
    tokenizer.add_tokens(['[NOT]','[FEW]','[MANY]'], special_tokens=True)
    model = AutoModelForSequenceClassification.from_pretrained('checkpoints/checkpoint-896', num_labels=len(labels))
    model = model.to("cuda")
    model.resize_token_embeddings(len(tokenizer))    
    
    train_data2 = NLIsDataset(train_records2, tokenizer, MAX_TOKENS, labels)
    val_data2 = NLIsDataset(val_records2, tokenizer, MAX_TOKENS, labels)

    print(tokenizer.convert_ids_to_tokens(val_data2[0]['input_ids'])[:10])
    
    model.train()

    callbacks = [EarlyStoppingCallback(early_stopping_patience=PATIENCE)]

    training_args = TrainingArguments(
        output_dir="checkpoints2mr",
        evaluation_strategy="steps",
        save_strategy="steps",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        logging_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        warmup_steps=WARMUP_STEPS,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        report_to="none",
        load_best_model_at_end=True,
        save_total_limit=2
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data2,
        eval_dataset=val_data2,
        callbacks=callbacks
    )

    !rm -rf checkpoints2
    trainer.train()    
    
    model.eval()
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

    y_true2 = np.array([labels.index(r["label"]) for r in test_records2], dtype=np.int32)
    test_pairs2 = [(r["left"], r["right"]) for r in test_records2]

    y_pred2, y_pred_prob2 = pipe_predict(test_pairs2, pipe)

    print(classification_report(y_true2, y_pred2, digits=3))
    print(confusion_matrix(y_true2, y_pred2))    
    
    not_embd = model.bert.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[NOT]']]] # .cpu().detach().numpy())
    few_embd = model.bert.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[FEW]']]]
    many_embd = model.bert.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[MANY]']]]

    np.savetxt(f'embd.not.mbert.{run:02}.txt', not_embd.cpu().detach().numpy())
    np.savetxt(f'embd.few.mbert.{run:02}.txt', few_embd.cpu().detach().numpy())
    np.savetxt(f'embd.many.mbert.{run:02}.txt', many_embd.cpu().detach().numpy())

=== 0


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481728
64,0.4702,0.473604
96,0.4436,0.433418
128,0.341,0.294437
160,0.285,0.268062
192,0.2462,0.267965
224,0.2515,0.259124
256,0.2511,0.256302
288,0.2459,0.258736
320,0.2345,0.257354


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-128

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.996     0.887     0.938       781
           1      0.866     0.923     0.893       944
           2      0.844     0.873     0.858       675

    accuracy                          0.897      2400
   macro avg      0.902     0.894     0.897      2400
weighted avg      0.902     0.897     0.898      2400

[[693  52  36]
 [  0 871  73]
 [  3  83 589]]
=== 1


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 2


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 3


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 4


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 5


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 6


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 7


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 8


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 9


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 10


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 11


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 12


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 13


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 14


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 15


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 16


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 17


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 18


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 19


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 20


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 21


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 22


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 23


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 24


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 25


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 26


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 27


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 28


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 29


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 30


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 31


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 32


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 33


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 34


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 35


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
=== 36


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6091,0.485666
64,0.4684,0.471399
96,0.4556,0.466104
128,0.4372,0.404997
160,0.3432,0.288111
192,0.2629,0.269618
224,0.2537,0.260253
256,0.2499,0.253421
288,0.2414,0.257515
320,0.2324,0.254189


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.883     0.938       781
           1      0.901     0.891     0.896       944
           2      0.811     0.933     0.868       675

    accuracy                          0.900      2400
   macro avg      0.904     0.903     0.901      2400
weighted avg      0.908     0.900     0.902      2400

[[690  47  44]
 [  0 841 103]
 [  0  45 630]]
=== 37


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5762,0.47737
64,0.4628,0.467086
96,0.4456,0.434319
128,0.3506,0.305
160,0.2816,0.271644
192,0.2393,0.274748
224,0.2561,0.256551
256,0.2473,0.254168
288,0.2427,0.256164
320,0.2343,0.252852


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.893     0.892     0.892       944
           2      0.815     0.924     0.866       675

    accuracy                          0.899      2400
   macro avg      0.903     0.900     0.899      2400
weighted avg      0.906     0.899     0.900      2400

[[691  50  40]
 [  0 842 102]
 [  0  51 624]]
=== 38


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.5823,0.481902
64,0.472,0.473886
96,0.4516,0.467655
128,0.4413,0.445476
160,0.3881,0.309914
192,0.2734,0.280694
224,0.2648,0.263402
256,0.2578,0.257509
288,0.2463,0.256616
320,0.2362,0.258732


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-448] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.903     0.892     0.898       944
           2      0.815     0.938     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.905     0.903      2400
weighted avg      0.910     0.902     0.904      2400

[[691  48  42]
 [  0 842 102]
 [  0  42 633]]
=== 39


loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

['[CLS]', 'every', 'thought', '##ful', 'w', '##his', '##tler', 'tac', '##itur', '##nl']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.603,0.488299
64,0.4685,0.468142
96,0.4518,0.458383
128,0.4052,0.329172
160,0.3022,0.271678
192,0.2474,0.269684
224,0.2534,0.256759
256,0.2486,0.254095
288,0.2431,0.253966
320,0.234,0.255008


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-32
Configuration saved in checkpoints2mr/checkpoint-32/config.json
Model weights saved in checkpoints2mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-64
Configuration saved in checkpoints2mr/checkpoint-64/config.json
Model weights saved in checkpoints2mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 128
Saving model checkpoint to checkpoints2mr/checkpoint-96
Configuration saved in checkpoints2mr/checkpoint-96/config.json
Model weights saved in checkpoints2mr/checkpoint-96/pytorch_model.bin
Deleting older checkpoint [checkpoints2m

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      1.000     0.885     0.939       781
           1      0.920     0.874     0.896       944
           2      0.798     0.960     0.872       675

    accuracy                          0.902      2400
   macro avg      0.906     0.906     0.902      2400
weighted avg      0.912     0.902     0.903      2400

[[691  45  45]
 [  0 825 119]
 [  0  27 648]]
