In [1]:
import random
import torch
import numpy as np
import os

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(13370)

In [2]:
!head stage1_training_data.csv

,sentence1,sentence2,gold_label,sent1_readable,sent2_readable,sent1_subj_quant,sent1_subj_adj,sent1_obj_quant,sent1_obj_adj,sent1_adv,sent1_polarity,sent2_subj_quant,sent2_subj_adj,sent2_obj_quant,sent2_obj_adj,sent2_adv,sent2_polarity
8,some slow gymnast emptystring emptystring publicizes notevery emptystring straw,some emptystring gymnast emptystring jealously publicizes every opaque straw,neutral,some slow gymnast publicizes not every straw,some gymnast jealously publicizes every opaque straw,some,yes,notevery,no,no,aff,some,no,every,yes,yes,aff
12,no Spanish receiver emptystring emptystring publicizes some dry lemur,every emptystring receiver emptystring fortunately publicizes every dry lemur,contradiction,no spanish receiver publicizes some dry lemur,every receiver fortunately publicizes every dry lemur,no,yes,some,yes,no,aff,every,no,every,yes,yes,aff
13,no furry linguist emptystring deftly draws no pink lemur,every furry linguist emptystring deftly draws every emptystring lem

In [None]:
# 7 sent1_subj_adj
# 9 sent1_obj_adj
# 10 sent1_adv,sent1_polarity,sent2_subj_quant,sent2_subj_adj,sent2_obj_quant,sent2_obj_adj,sent2_adv,sent2_polarity

sadj1 = set()
subj1 = set()
adv1 = set()
verb1 = set()
oadj1 = set()
obj1 = set()

sadj2 = set()
subj2 = set()
adv2 = set()
verb2 = set()
oadj2 = set()
obj2 = set()

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        sadj1.add(words[1])
        subj1.add(words[2])
        adv1.add(words[4])
        verb1.add(words[5])
        oadj1.add(words[7])
        obj1.add(words[8])
        words = chunks[2].split()
        sadj2.add(words[1])
        subj2.add(words[2])
        adv2.add(words[4])
        verb2.add(words[5])
        oadj2.add(words[7])
        obj2.add(words[8])

set_random_seed(0xABBA+1)

sadjs = list(sadj1-set(('emptystring',)))
np.random.shuffle(sadjs)

subjs = list(subj1-set(('emptystring',)))
np.random.shuffle(subjs)

objs = list(obj1-set(('emptystring',)))
np.random.shuffle(objs)

advs = list(adv1-set(('emptystring',)))
np.random.shuffle(advs)

oadjs = list(oadj1-set(('emptystring',)))
np.random.shuffle(oadjs)

verbs = list(verb1-set(('emptystring',)))
np.random.shuffle(verbs)

TRAIN_PART = 60
VALID_PART = 1000

train_sadjs = sadjs[:TRAIN_PART]+['emptystring',]
train_subjs = subjs[:TRAIN_PART]+['emptystring',]
train_objs = objs[:TRAIN_PART]+['emptystring',]
train_advs = advs[:TRAIN_PART]+['emptystring',]
train_oadjs = oadjs[:TRAIN_PART]+['emptystring',]
train_verbs = verbs[:TRAIN_PART]+['emptystring',]

valid_sadjs = sadjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_subjs = subjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_objs =  objs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_advs =  advs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_oadjs = oadjs[TRAIN_PART:VALID_PART]+['emptystring',]
valid_verbs = verbs[TRAIN_PART:VALID_PART]+['emptystring',]


In [4]:
print(train_sadjs)
print(valid_sadjs)

['Ukranian', 'helpful', 'proud', 'slimy', 'Mississippian', 'German', 'Spanish', 'stupid', 'thoughtful', 'noble', 'outstanding', 'lucky', 'polite', 'Indian', 'fuzzy', 'crazy', 'devout', 'hopeful', 'Alaskan', 'English', 'Panamanian', 'Italian', 'religious', 'small', 'beautiful', 'angry', 'rude', 'Indonesian', 'insane', 'scatterbrained', 'unhinged', 'Texan', 'Korean', 'sly', 'friendly', 'idealistic', 'uncouth', 'rough', 'loving', 'Polish', 'untrustworthy', 'Austrian', 'furry', 'charismatic', 'Mongolian', 'Latvian', 'quick', 'unpredictable', 'Moroccan', 'Californian', 'kooky', 'Mexican', 'Alabaman', 'smooth', 'Afghani', 'Chinese', 'unhelpful', 'surly', 'underwhelming', 'Washingtonian', 'emptystring']
['Oregonian', 'soft', 'French', 'chummy', 'silly', 'Oklahoman', 'boisterous', 'Canadian', 'horrifying', 'coy', 'sheltered', 'taciturn', 'seductive', 'Japanese', 'vivacious', 'Israeli', 'Nebraskan', 'Siberian', 'happy', 'sneaky', 'Hawaiian', 'big', 'slow', 'sad', 'savvy', 'Lithuanian', 'Pakista

In [5]:
records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(records)
print(len(records))

475998


In [6]:
train_records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        if  words[1] not in train_sadjs or \
            words[2] not in train_subjs or \
            words[4] not in train_advs or \
            words[5] not in train_verbs or \
            words[7] not in train_oadjs or \
            words[8] not in train_objs: continue
        words = chunks[2].split()
        if  words[1] not in train_sadjs or \
            words[2] not in train_subjs or \
            words[4] not in train_advs or \
            words[5] not in train_verbs or \
            words[7] not in train_oadjs or \
            words[8] not in train_objs: continue
        train_records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(train_records)
print(len(train_records))

20117


In [7]:
val_records = []

for idx, line in enumerate(open('stage1_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        words = chunks[1].split()
        if  words[1] not in valid_sadjs or \
            words[2] not in valid_subjs or \
            words[8] not in valid_objs: continue
        words = chunks[2].split()
        if  words[1] not in valid_sadjs or \
            words[2] not in valid_subjs or \
            words[4] not in valid_advs or \
            words[5] not in valid_verbs or \
            words[7] not in valid_oadjs or \
            words[8] not in valid_objs: continue
        val_records.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(val_records)
print(len(val_records))
# val_records

3622


In [8]:
test_records = val_records[:len(val_records)//2]
val_records = val_records[-len(val_records)//2:]
print(len(test_records), len(val_records))

1811 1811


In [9]:
# VAL_SHARE = .1
# TEST_SHARE = .1

# val_records = records[:int(len(records)*VAL_SHARE)]
# test_records = records[-int(len(records)*TEST_SHARE):]
# train_records = records[int(len(records)*VAL_SHARE):-int(len(records)*TEST_SHARE)]

In [10]:
from collections import Counter

labels_counter = Counter([r["label"] for r in records])
print(labels_counter, sum(labels_counter.values()))
labels = list(labels_counter.keys())
print(labels)

Counter({'contradiction': 159754, 'neutral': 158900, 'entailment': 157344}) 475998
['entailment', 'contradiction', 'neutral']


In [11]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NLIsDataset(Dataset):
    def __init__(self, records, tokenizer, max_tokens, labels):
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.records = records
        self.labels = labels
    
    def __len__(self):
        return len(self.records)
    
    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["left"],
            text_pair=record["right"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        label = record.get("label", None)
        if label is not None and label in self.labels:
            output["labels"] = torch.tensor(self.labels.index(label))
        return output

In [12]:
MODEL_NAME = "xlm-roberta-large"
# MODEL_NAME = 'bert-base-multilingual-uncased'
TOKENIZER_NAME = MODEL_NAME
MAX_TOKENS = 100
EPOCHS = 4*32 # 4
EVAL_STEPS = 32*4 # 32
WARMUP_STEPS = 16
LR = 0.00002
BATCH_SIZE = 32*4 # 32
GRAD_ACCUM_STEPS = 1 # 4
PATIENCE = 3

BATCH_SIZE = 32
GRAD_ACCUM_STEPS = 4


In [13]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, do_lower_case=False)
train_data = NLIsDataset(train_records, tokenizer, MAX_TOKENS, labels)
val_data = NLIsDataset(val_records, tokenizer, MAX_TOKENS, labels)

In [14]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
model = model.to("cuda")

callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

training_args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    callbacks=callbacks
)

!rm -rf checkpoints
trainer.train()

2021-11-10 20:30:01.627929: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of X

Step,Training Loss,Validation Loss
128,1.0366,0.788705
256,0.6778,0.534601
384,0.5378,0.588231
512,0.5437,0.597887


***** Running Evaluation *****
  Num examples = 1793
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-128
Configuration saved in checkpoints/checkpoint-128/config.json
Model weights saved in checkpoints/checkpoint-128/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1793
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-256
Configuration saved in checkpoints/checkpoint-256/config.json
Model weights saved in checkpoints/checkpoint-256/pytorch_model.bin
Deleting older checkpoint [checkpoints/checkpoint-128] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1793
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-384
Configuration saved in checkpoints/checkpoint-384/config.json
Model weights saved in checkpoints/checkpoint-384/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1793
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-512
Configuration saved in che

TrainOutput(global_step=512, training_loss=0.6989619880914688, metrics={'train_runtime': 1557.4109, 'train_samples_per_second': 1631.015, 'train_steps_per_second': 12.739, 'total_flos': 2.20209475664142e+16, 'train_loss': 0.6989619880914688, 'epoch': 3.3})

In [14]:
from tqdm.notebook import tqdm

def get_batch(data, batch_size):
    start_index = 0
    while start_index < len(data):
        end_index = start_index + batch_size
        batch = data[start_index:end_index]
        yield batch
        start_index = end_index

def pipe_predict(data, pipe, batch_size=64):
    raw_preds = []
    for batch in tqdm(get_batch(data, batch_size)):
        raw_preds += pipe(batch)
    preds = np.array([int(max(labels, key=lambda x: x["score"])["label"][-1]) for labels in raw_preds])
    pp = np.array([[l["score"] for l in labels] for labels in raw_preds])
    return preds, pp

In [19]:
from transformers import pipeline
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

model.eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

y_true = np.array([labels.index(r["label"]) for r in test_records], dtype=np.int32)
test_pairs = [(r["left"], r["right"]) for r in test_records]

y_pred, y_pred_prob = pipe_predict(test_pairs, pipe)

print(classification_report(y_true, y_pred, digits=3))
print(confusion_matrix(y_true, y_pred))


  '"sox" backend is being deprecated. '


0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.630     0.703     0.665       667
           1      0.630     0.635     0.632       654
           2      0.939     0.782     0.853       490

    accuracy                          0.700      1811
   macro avg      0.733     0.706     0.717      1811
weighted avg      0.714     0.700     0.704      1811

[[469 198   0]
 [214 415  25]
 [ 61  46 383]]


In [15]:
### load model
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback


2021-11-10 22:41:31.197354: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [16]:
records2 = []

for idx, line in enumerate(open('stage2_training_data.csv')):
    if idx:
        chunks = line.strip().split(',')
        assert len(chunks)==18, 'parse error'
        records2.append( {'left':chunks[4], 'right': chunks[5], 'label':chunks[3]})

np.random.shuffle(records2)
# records2[:5]
VAL_SHARE = .1
TEST_SHARE = .1

val_records2 = records2[:int(len(records2)*VAL_SHARE)]
test_records2 = records2[-int(len(records2)*TEST_SHARE):]
train_records2 = records2[int(len(records2)*VAL_SHARE):-int(len(records2)*TEST_SHARE)]

# tokenizer.convert_ids_to_tokens(val_data2[0]['input_ids'])[:10]

In [17]:
MAX_TOKENS = 100
EPOCHS = 4 # 4
EVAL_STEPS = 32 # 32
WARMUP_STEPS = 16
LR = 0.00002
BATCH_SIZE = 32 # 32
GRAD_ACCUM_STEPS = 4 # 4
PATIENCE = 3

In [None]:
for run in range(40):
    print('===',run)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
    tokenizer.add_tokens(['[NOT]','[FEW]','[MANY]'], special_tokens=True)
    model = AutoModelForSequenceClassification.from_pretrained('checkpoints/checkpoint-256', num_labels=len(labels))
    model = model.to("cuda")
    model.resize_token_embeddings(len(tokenizer))    
    
    np.random.shuffle(train_records2)
    train_data2 = NLIsDataset(train_records2, tokenizer, MAX_TOKENS, labels)
    val_data2 = NLIsDataset(val_records2, tokenizer, MAX_TOKENS, labels)

    print(tokenizer.convert_ids_to_tokens(val_data2[0]['input_ids'])[:10])
    
    model.train()

    callbacks = [EarlyStoppingCallback(early_stopping_patience=PATIENCE)]

    training_args = TrainingArguments(
        output_dir="checkpoints2-xlmr-mr",
        evaluation_strategy="steps",
        save_strategy="steps",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        logging_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        warmup_steps=WARMUP_STEPS,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        report_to="none",
        load_best_model_at_end=True,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data2,
        eval_dataset=val_data2,
        callbacks=callbacks
    )

    !rm -rf checkpoints2
    trainer.train()    
    
    model.eval()
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

    y_true2 = np.array([labels.index(r["label"]) for r in test_records2], dtype=np.int32)
    test_pairs2 = [(r["left"], r["right"]) for r in test_records2]

    y_pred2, y_pred_prob2 = pipe_predict(test_pairs2, pipe)

    print(classification_report(y_true2, y_pred2, digits=3))
    print(confusion_matrix(y_true2, y_pred2))    
    
    not_embd = model.roberta.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[NOT]']]] # .cpu().detach().numpy())
    few_embd = model.roberta.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[FEW]']]]
    many_embd = model.roberta.embeddings.word_embeddings.weight.data[[tokenizer.vocab['[MANY]']]]

    np.savetxt(f'embd.not.xlmr.{run:02}.txt', not_embd.cpu().detach().numpy())
    np.savetxt(f'embd.few.xlmr.{run:02}.txt', few_embd.cpu().detach().numpy())
    np.savetxt(f'embd.many.xlmr.{run:02}.txt', many_embd.cpu().detach().numpy())

=== 0


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6749,0.611844
64,0.5816,0.504875
96,0.4149,0.332611
128,0.3348,0.286006
160,0.3228,0.298301
192,0.307,0.288747
224,0.3094,0.296096


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-160] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.824     0.949     0.882       930
           1      0.839     0.838     0.839       710
           2      0.998     0.813     0.896       760

    accuracy                          0.873      2400
   macro avg      0.887     0.867     0.872      2400
weighted avg      0.884     0.873     0.874      2400

[[883  47   0]
 [114 595   1]
 [ 75  67 618]]
=== 1


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6493,0.5633
64,0.5872,0.493425
96,0.4283,0.363782
128,0.3358,0.317266
160,0.2975,0.307298
192,0.3051,0.281547
224,0.3053,0.312211
256,0.2983,0.283001
288,0.2816,0.283099


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-128] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-224] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.873     0.896     0.884       930
           1      0.795     0.921     0.853       710
           2      1.000     0.820     0.901       760

    accuracy                          0.879      2400
   macro avg      0.889     0.879     0.879      2400
weighted avg      0.890     0.879     0.880      2400

[[833  97   0]
 [ 56 654   0]
 [ 65  72 623]]
=== 2


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6801,0.561477
64,0.5436,0.389002
96,0.39,0.384404
128,0.3599,0.352104
160,0.3331,0.309867
192,0.2904,0.304017
224,0.3237,0.300108
256,0.3055,0.302374
288,0.3104,0.285573
320,0.2737,0.272081


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-192] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.881     0.900     0.890       930
           1      0.833     0.921     0.875       710
           2      0.998     0.874     0.932       760

    accuracy                          0.898      2400
   macro avg      0.904     0.898     0.899      2400
weighted avg      0.904     0.898     0.899      2400

[[837  92   1]
 [ 56 654   0]
 [ 57  39 664]]
=== 3


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6494,0.553621
64,0.5796,0.446725
96,0.4278,0.337199
128,0.353,0.319624
160,0.3201,0.287654
192,0.3217,0.301104
224,0.3153,0.289547
256,0.2976,0.301638


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-544] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.839     0.911     0.874       930
           1      0.804     0.870     0.836       710
           2      1.000     0.818     0.900       760

    accuracy                          0.870      2400
   macro avg      0.881     0.867     0.870      2400
weighted avg      0.880     0.870     0.871      2400

[[847  83   0]
 [ 92 618   0]
 [ 70  68 622]]
=== 4


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6453,0.581974
64,0.5063,0.384228
96,0.3789,0.395785
128,0.347,0.322302
160,0.3395,0.307622
192,0.3185,0.356715
224,0.3138,0.305237
256,0.2927,0.320274
288,0.2898,0.304633
320,0.2893,0.305694


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-160] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.910     0.845     0.876       930
           1      0.787     0.976     0.872       710
           2      0.998     0.862     0.925       760

    accuracy                          0.889      2400
   macro avg      0.899     0.894     0.891      2400
weighted avg      0.902     0.889     0.890      2400

[[786 144   0]
 [ 16 693   1]
 [ 62  43 655]]
=== 5


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6448,0.544878
64,0.5624,0.469915
96,0.3967,0.327815
128,0.3385,0.305004
160,0.3281,0.304643
192,0.3113,0.30904
224,0.3158,0.290654
256,0.3026,0.307215
288,0.3006,0.284343
320,0.293,0.33935


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.901     0.870     0.885       930
           1      0.791     0.963     0.869       710
           2      1.000     0.838     0.912       760

    accuracy                          0.887      2400
   macro avg      0.897     0.890     0.889      2400
weighted avg      0.900     0.887     0.889      2400

[[809 121   0]
 [ 26 684   0]
 [ 63  60 637]]
=== 6


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6587,0.552695
64,0.5662,0.436694
96,0.4002,0.306241
128,0.3315,0.330227
160,0.3195,0.291994
192,0.299,0.287323
224,0.3033,0.288442
256,0.2949,0.367933
288,0.2958,0.278096
320,0.2956,0.289362


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-512] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.810     0.989     0.891       930
           1      0.889     0.804     0.845       710
           2      1.000     0.818     0.900       760

    accuracy                          0.880      2400
   macro avg      0.900     0.871     0.878      2400
weighted avg      0.894     0.880     0.880      2400

[[920  10   0]
 [139 571   0]
 [ 77  61 622]]
=== 7


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6631,0.574417
64,0.5997,0.474865
96,0.4157,0.365645
128,0.3482,0.295695
160,0.315,0.294375
192,0.3062,0.289612
224,0.3098,0.293854
256,0.2975,0.307688
288,0.2788,0.294859


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.902     0.855     0.878       930
           1      0.770     0.969     0.858       710
           2      0.997     0.821     0.900       760

    accuracy                          0.878      2400
   macro avg      0.890     0.882     0.879      2400
weighted avg      0.893     0.878     0.879      2400

[[795 133   2]
 [ 22 688   0]
 [ 64  72 624]]
=== 8


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6497,0.567145
64,0.5994,0.538261
96,0.4634,0.392141
128,0.3491,0.313771
160,0.3362,0.304046
192,0.314,0.292961
224,0.3189,0.288316
256,0.3006,0.279418
288,0.2993,0.283258
320,0.289,0.288058


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-192] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.808     0.996     0.892       930
           1      0.947     0.799     0.866       710
           2      1.000     0.862     0.926       760

    accuracy                          0.895      2400
   macro avg      0.918     0.885     0.895      2400
weighted avg      0.910     0.895     0.895      2400

[[926   4   0]
 [143 567   0]
 [ 77  28 655]]
=== 9


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6971,0.573478
64,0.5695,0.444769
96,0.3872,0.347294
128,0.3435,0.334711
160,0.309,0.291434
192,0.2955,0.285594
224,0.3059,0.280634
256,0.2935,0.298517
288,0.2914,0.275886
320,0.2972,0.286649


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-480] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.878     0.909     0.893       930
           1      0.837     0.908     0.871       710
           2      1.000     0.878     0.935       760

    accuracy                          0.899      2400
   macro avg      0.905     0.898     0.900      2400
weighted avg      0.905     0.899     0.900      2400

[[845  85   0]
 [ 65 645   0]
 [ 52  41 667]]
=== 10


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6615,0.561063
64,0.5848,0.544015
96,0.4649,0.334429
128,0.3386,0.294773
160,0.3261,0.292613
192,0.3131,0.285395
224,0.2867,0.278058
256,0.2948,0.28548
288,0.2909,0.275199
320,0.2962,0.275276


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.810     0.984     0.888       930
           1      0.931     0.803     0.862       710
           2      1.000     0.866     0.928       760

    accuracy                          0.893      2400
   macro avg      0.914     0.884     0.893      2400
weighted avg      0.906     0.893     0.893      2400

[[915  15   0]
 [140 570   0]
 [ 75  27 658]]
=== 11


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6521,0.56079
64,0.5627,0.391489
96,0.3605,0.323639
128,0.3315,0.32458
160,0.3304,0.294962
192,0.2883,0.282401
224,0.2912,0.285197
256,0.3037,0.282153
288,0.3051,0.284206
320,0.2709,0.273352


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-448] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-544] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.902     0.867     0.884       930
           1      0.794     0.966     0.872       710
           2      0.998     0.843     0.914       760

    accuracy                          0.889      2400
   macro avg      0.898     0.892     0.890      2400
weighted avg      0.900     0.889     0.890      2400

[[806 123   1]
 [ 24 686   0]
 [ 64  55 641]]
=== 12


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6588,0.585656
64,0.5419,0.395839
96,0.3864,0.314577
128,0.3368,0.298923
160,0.3078,0.289032
192,0.3137,0.294787
224,0.3097,0.283501
256,0.3107,0.312226
288,0.31,0.294133
320,0.3031,0.307167


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.821     0.965     0.887       930
           1      0.857     0.830     0.843       710
           2      0.997     0.814     0.896       760

    accuracy                          0.877      2400
   macro avg      0.892     0.870     0.876      2400
weighted avg      0.888     0.877     0.877      2400

[[897  31   2]
 [121 589   0]
 [ 74  67 619]]
=== 13


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6815,0.543392
64,0.5425,0.468254
96,0.4115,0.311522
128,0.321,0.306015
160,0.3282,0.307631
192,0.3086,0.310182
224,0.3206,0.319086


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-224] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-320] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.870     0.889     0.879       930
           1      0.781     0.917     0.843       710
           2      0.995     0.805     0.890       760

    accuracy                          0.871      2400
   macro avg      0.882     0.870     0.871      2400
weighted avg      0.883     0.871     0.872      2400

[[827 102   1]
 [ 57 651   2]
 [ 67  81 612]]
=== 14


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6658,0.570557
64,0.563,0.429299
96,0.3972,0.303637
128,0.3131,0.309021
160,0.3013,0.289322
192,0.2943,0.296641
224,0.2806,0.287037
256,0.3049,0.277308
288,0.2888,0.275571
320,0.2806,0.268541


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-128] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-224] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.855     0.915     0.884       930
           1      0.844     0.894     0.869       710
           2      1.000     0.859     0.924       760

    accuracy                          0.891      2400
   macro avg      0.900     0.890     0.892      2400
weighted avg      0.898     0.891     0.892      2400

[[851  79   0]
 [ 75 635   0]
 [ 69  38 653]]
=== 15


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6617,0.574034
64,0.5398,0.378323
96,0.3784,0.326337
128,0.3297,0.301332
160,0.3141,0.2922
192,0.3191,0.288852
224,0.2856,0.307343
256,0.3027,0.305105
288,0.3129,0.282263
320,0.3036,0.329936


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.898     0.858     0.877       930
           1      0.780     0.955     0.859       710
           2      0.992     0.838     0.909       760

    accuracy                          0.880      2400
   macro avg      0.890     0.884     0.882      2400
weighted avg      0.893     0.880     0.882      2400

[[798 132   0]
 [ 27 678   5]
 [ 64  59 637]]
=== 16


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6792,0.568061
64,0.5413,0.447667
96,0.3937,0.305667
128,0.3477,0.30817
160,0.3136,0.332137
192,0.299,0.295904
224,0.3154,0.294811
256,0.3014,0.276301
288,0.2998,0.278149
320,0.2789,0.289672


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.802     0.995     0.888       930
           1      0.906     0.785     0.841       710
           2      0.992     0.824     0.900       760

    accuracy                          0.878      2400
   macro avg      0.900     0.868     0.876      2400
weighted avg      0.893     0.878     0.878      2400

[[925   3   2]
 [150 557   3]
 [ 79  55 626]]
=== 17


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6708,0.557689
64,0.5514,0.382609
96,0.3924,0.32684
128,0.3333,0.336934
160,0.3084,0.30094
192,0.3049,0.279699
224,0.29,0.278612
256,0.3047,0.289431
288,0.3026,0.292265
320,0.2811,0.274527


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-256] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-352] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.803     0.996     0.889       930
           1      0.917     0.794     0.851       710
           2      1.000     0.832     0.908       760

    accuracy                          0.884      2400
   macro avg      0.907     0.874     0.883      2400
weighted avg      0.899     0.884     0.884      2400

[[926   4   0]
 [146 564   0]
 [ 81  47 632]]
=== 18


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6729,0.566564
64,0.6012,0.499637
96,0.4184,0.30791
128,0.3495,0.299934
160,0.3127,0.290145
192,0.3058,0.298411
224,0.2972,0.292125
256,0.2929,0.283419
288,0.2847,0.294209
320,0.2996,0.289736


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-384] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-480] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.916     0.844     0.879       930
           1      0.788     0.985     0.875       710
           2      1.000     0.863     0.927       760

    accuracy                          0.892      2400
   macro avg      0.901     0.897     0.894      2400
weighted avg      0.905     0.892     0.893      2400

[[785 145   0]
 [ 11 699   0]
 [ 61  43 656]]
=== 19


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6778,0.612598
64,0.6041,0.513536
96,0.4347,0.308489
128,0.3049,0.291397
160,0.318,0.283657
192,0.2905,0.301376
224,0.2947,0.28934
256,0.2925,0.28536


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-544] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.863     0.930     0.895       930
           1      0.823     0.901     0.860       710
           2      1.000     0.816     0.899       760

    accuracy                          0.885      2400
   macro avg      0.895     0.882     0.885      2400
weighted avg      0.895     0.885     0.886      2400

[[865  65   0]
 [ 70 640   0]
 [ 67  73 620]]
=== 20


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6467,0.559476
64,0.565,0.4599
96,0.4081,0.354897
128,0.347,0.303312
160,0.3092,0.288694
192,0.3144,0.299732
224,0.3052,0.286945
256,0.3169,0.282437
288,0.271,0.277132
320,0.2693,0.277664


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-160] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.914     0.832     0.871       930
           1      0.778     0.977     0.866       710
           2      1.000     0.870     0.930       760

    accuracy                          0.887      2400
   macro avg      0.897     0.893     0.889      2400
weighted avg      0.901     0.887     0.888      2400

[[774 156   0]
 [ 16 694   0]
 [ 57  42 661]]
=== 21


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.654,0.712323
64,0.5639,0.492532
96,0.4168,0.315638
128,0.3359,0.314602
160,0.3265,0.295029
192,0.3002,0.27817
224,0.3184,0.294191
256,0.3058,0.293926
288,0.2919,0.290067


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.886     0.862     0.874       930
           1      0.770     0.945     0.849       710
           2      0.995     0.817     0.897       760

    accuracy                          0.873      2400
   macro avg      0.884     0.875     0.873      2400
weighted avg      0.886     0.873     0.874      2400

[[802 127   1]
 [ 37 671   2]
 [ 66  73 621]]
=== 22


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6651,0.57463
64,0.5788,0.503523
96,0.4459,0.335807
128,0.3585,0.300465
160,0.3167,0.309473
192,0.3243,0.298001
224,0.2795,0.280431
256,0.292,0.277809
288,0.2949,0.269583
320,0.2565,0.276078


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-192] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.936     0.822     0.875       930
           1      0.775     0.992     0.870       710
           2      0.991     0.882     0.933       760

    accuracy                          0.891      2400
   macro avg      0.901     0.898     0.893      2400
weighted avg      0.906     0.891     0.892      2400

[[764 160   6]
 [  6 704   0]
 [ 46  44 670]]
=== 23


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6505,0.591025
64,0.6098,0.525015
96,0.501,0.346521
128,0.3556,0.334163
160,0.3388,0.298312
192,0.3102,0.282972
224,0.2978,0.344948
256,0.292,0.266616
288,0.2892,0.27914
320,0.2727,0.292363


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.811     0.985     0.889       930
           1      0.930     0.807     0.864       710
           2      1.000     0.861     0.925       760

    accuracy                          0.893      2400
   macro avg      0.914     0.884     0.893      2400
weighted avg      0.906     0.893     0.893      2400

[[916  14   0]
 [137 573   0]
 [ 77  29 654]]
=== 24


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6268,0.562563
64,0.5626,0.398677
96,0.3603,0.314293
128,0.3235,0.283137
160,0.3075,0.289964
192,0.3071,0.285585
224,0.2873,0.277197
256,0.31,0.292219
288,0.284,0.277877
320,0.2893,0.269716


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-416] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-512] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.929     0.847     0.886       930
           1      0.794     0.982     0.878       710
           2      0.996     0.883     0.936       760

    accuracy                          0.898      2400
   macro avg      0.906     0.904     0.900      2400
weighted avg      0.910     0.898     0.900      2400

[[788 139   3]
 [ 13 697   0]
 [ 47  42 671]]
=== 25


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6379,0.581916
64,0.5843,0.527453
96,0.4601,0.311947
128,0.3322,0.315662
160,0.3249,0.29715
192,0.3016,0.315281
224,0.305,0.295403
256,0.3039,0.280424
288,0.2789,0.297451
320,0.2798,0.306409


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-480] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.816     0.989     0.895       930
           1      0.937     0.811     0.869       710
           2      1.000     0.866     0.928       760

    accuracy                          0.897      2400
   macro avg      0.918     0.889     0.897      2400
weighted avg      0.910     0.897     0.898      2400

[[920  10   0]
 [134 576   0]
 [ 73  29 658]]
=== 26


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6785,0.579159
64,0.5978,0.514366
96,0.4038,0.319483
128,0.331,0.291635
160,0.328,0.291046
192,0.2861,0.308114
224,0.2929,0.281224
256,0.31,0.304418
288,0.2942,0.27868
320,0.307,0.284896


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-480] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.873     0.902     0.887       930
           1      0.801     0.921     0.857       710
           2      0.998     0.818     0.899       760

    accuracy                          0.881      2400
   macro avg      0.891     0.881     0.881      2400
weighted avg      0.892     0.881     0.882      2400

[[839  90   1]
 [ 56 654   0]
 [ 66  72 622]]
=== 27


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6731,0.563616
64,0.5485,0.376639
96,0.3779,0.316678
128,0.3199,0.298435
160,0.3167,0.301671
192,0.3018,0.28021
224,0.2991,0.283612
256,0.3083,0.284474
288,0.2819,0.270084
320,0.2788,0.274974


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-384] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.944     0.822     0.879       930
           1      0.772     0.999     0.871       710
           2      1.000     0.886     0.939       760

    accuracy                          0.894      2400
   macro avg      0.906     0.902     0.896      2400
weighted avg      0.911     0.894     0.896      2400

[[764 166   0]
 [  1 709   0]
 [ 44  43 673]]
=== 28


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6751,0.554196
64,0.5624,0.527205
96,0.4816,0.385044
128,0.3597,0.306675
160,0.3227,0.288761
192,0.292,0.295932
224,0.3041,0.318113
256,0.3143,0.313276


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoints2-xlmr-mr/checkpoint-96/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.896     0.849     0.872       930
           1      0.756     0.961     0.846       710
           2      1.000     0.811     0.895       760

    accuracy                          0.870      2400
   macro avg      0.884     0.874     0.871      2400
weighted avg      0.887     0.870     0.872      2400

[[790 140   0]
 [ 28 682   0]
 [ 64  80 616]]
=== 29


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6475,0.537468
64,0.5734,0.451263
96,0.3963,0.333506
128,0.3551,0.294342
160,0.3351,0.316676
192,0.3159,0.285732
224,0.3086,0.291306
256,0.295,0.312331
288,0.2708,0.286335


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-160] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-256] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.839     0.951     0.891       930
           1      0.844     0.861     0.852       710
           2      0.997     0.816     0.897       760

    accuracy                          0.881      2400
   macro avg      0.893     0.876     0.880      2400
weighted avg      0.890     0.881     0.882      2400

[[884  46   0]
 [ 97 611   2]
 [ 73  67 620]]
=== 30


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6419,0.624653
64,0.5693,0.461525
96,0.3976,0.343815
128,0.3588,0.306233
160,0.3081,0.280553
192,0.2956,0.333829
224,0.296,0.296202
256,0.288,0.276606
288,0.2757,0.266942
320,0.2688,0.271317


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-192] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-288] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.908     0.856     0.881       930
           1      0.798     0.963     0.873       710
           2      1.000     0.876     0.934       760

    accuracy                          0.894      2400
   macro avg      0.902     0.899     0.896      2400
weighted avg      0.904     0.894     0.895      2400

[[796 134   0]
 [ 26 684   0]
 [ 55  39 666]]
=== 31


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6465,0.555126
64,0.5634,0.393508
96,0.3827,0.301812
128,0.329,0.310683
160,0.2913,0.293434
192,0.3106,0.305496
224,0.3081,0.270221
256,0.2719,0.277028
288,0.272,0.267026
320,0.2595,0.271094


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-544] due to args.save_total_limit
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-64
Configuration saved in checkpoints2-xlmr-mr/checkpoint-64/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-64/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-32] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-96
Configuration saved in checkpoi

0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0      0.898     0.858     0.877       930
           1      0.796     0.955     0.868       710
           2      1.000     0.867     0.929       760

    accuracy                          0.890      2400
   macro avg      0.898     0.893     0.891      2400
weighted avg      0.900     0.890     0.891      2400

[[798 132   0]
 [ 32 678   0]
 [ 59  42 659]]
=== 32


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-large/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/4d7a1550c9ab8701667bc307a1213c040fcc06dc87a5e4994e72aecc0d7e0337.302e267433fe7c84959a639e9c7c555043daa4020c0daf311785b53de7b8685e
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size"

['<s>', '▁no', '▁big', '▁prostitu', 'te', '[NOT]', '▁brush', 'es', '▁no', '▁flori']
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 19200
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 600


Step,Training Loss,Validation Loss
32,0.6574,0.564354


***** Running Evaluation *****
  Num examples = 2400
  Batch size = 32
Saving model checkpoint to checkpoints2-xlmr-mr/checkpoint-32
Configuration saved in checkpoints2-xlmr-mr/checkpoint-32/config.json
Model weights saved in checkpoints2-xlmr-mr/checkpoint-32/pytorch_model.bin
Deleting older checkpoint [checkpoints2-xlmr-mr/checkpoint-576] due to args.save_total_limit
