In [1]:
!pip install transformers[sentencepiece] tokenizers datasets seqeval



In [2]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoTokenizer, DataCollatorWithPadding, DataCollatorForTokenClassification, TrainingArguments, Trainer
import os
import shutil
from tqdm.auto import tqdm
import collections

# Base checkpoint that every task-specific model in this notebook is instantiated from.
checkpoint = "google/electra-small-discriminator"

# One tokenizer is shared by all tasks; the collator pads each batch with that tokenizer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer)

In [3]:
# Names of the CoNLL-2003 chunk tags; used to size the "chunk" classification head and to
# turn predicted ids back into tag strings when scoring.
label_list = load_dataset("conll2003", split="train").features["chunk_tags"].feature.names

Reusing dataset conll2003 (/home/ap_default/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def load_label_dataset(prefix):
    """Load the train/test/full TSV files of one label set and encode ``label`` as ClassLabel.

    Reads ``{prefix}_train.tsv``, ``{prefix}_test.tsv`` and ``{prefix}_full.tsv`` from the
    working directory. The first row of each file is skipped and the columns are named
    ``idx``, ``label`` and ``sentence``.

    Args:
        prefix: File-name prefix of the label set, e.g. ``"fn"``.

    Returns:
        A DatasetDict with the splits ``train``, ``test`` and ``full``.
    """
    dataset = load_dataset("csv",
                           data_files={split: f"{prefix}_{split}.tsv"
                                       for split in ("train", "test", "full")},
                           skiprows=1,
                           column_names=['idx', 'label', 'sentence'],
                           delimiter="\t")
    # NOTE(review): class_encode_column may build the label vocabulary per split; the label
    # maps used for training are taken from 'train' only - verify the ids agree with 'test'.
    return dataset.class_encode_column('label')


def show_dataset(dataset, splits=("train", "test")):
    """Print the dataset summary, the features of the given splits, and one training row."""
    print(dataset)
    for split in splits:
        print(dataset[split].features)
    print(dataset['train'][1])


fn_dataset = load_label_dataset("fn")
show_dataset(fn_dataset)

npc_dataset = load_label_dataset("npc")
show_dataset(npc_dataset)

vn_dataset = load_label_dataset("vn")
show_dataset(vn_dataset)

wn_dataset = load_label_dataset("wn")
# For this label set the features of the 'full' split are printed as well.
show_dataset(wn_dataset, splits=("train", "test", "full"))

Using custom data configuration default-7f8c5f914cfc28ef
Reusing dataset csv (/home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-d139fe2704a766fb.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-7fea2cb9ec57fcc6.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-6e16171400876ed3.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-c129e8c0ca276c9c.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7f8c5f914cfc28ef/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a2

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 3329
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 820
    })
    full: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 4149
    })
})
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=54, names=['112', '1132', '1210', '1262', '139', '1390', '16', '160', '1688', '17', '171', '197', '2135', '2161', '254', '263', '264', '265', '273', '276', '279', '2824', '283', '289', '293', '301', '31', '352', '395', '40', '41', '410', '414', '416', '42', '424', '43', '54', '55', '56', '59', '590', '6', '62', '63', '64', '65', '652', '66', '683', '7', '770', '801', 'unknown'], id=None), 'sentence': Value(dtype='string', id=None)}
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=54, names=['112', '1132', '1210', '1262', '139', '1390', '16', '160', '1688', '17', '171', '197', '2135', '2161

Using custom data configuration default-b98ff5898fa6ba87
Reusing dataset csv (/home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-5e4ea063706b6b77.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-daae674e05534f84.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-44db151f32c55575.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-9e357abff7ecfa54.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-b98ff5898fa6ba87/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a2

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 3327
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 832
    })
    full: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 4159
    })
})
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=3, names=['0', '1', 'unknown'], id=None), 'sentence': Value(dtype='string', id=None)}
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=3, names=['0', '1', 'unknown'], id=None), 'sentence': Value(dtype='string', id=None)}
{'idx': 1, 'label': 0, 'sentence': 'beetlebaum'}


Using custom data configuration default-7581f73791510896
Reusing dataset csv (/home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-34a4bd749b78cf59.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-bea5bd4198cf5603.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-78ba92e3d9b3080a.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-4461d705c5909d1c.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-7581f73791510896/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a2

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 4718
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 1154
    })
    full: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 5872
    })
})
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=70, names=['appear-48.1.1', 'assuming_position-50', 'banish-10.2', 'body_internal_states-40.6', 'break-45.1', 'bump-18.4', 'carry-11.4', 'carry-11.4-1-1', 'carve-21.2-2', 'chase-51.6', 'crane-40.3.2', 'cut-21.1-1', 'disassemble-23.3', 'eat-39.1-1', 'eat-39.1-2', 'escape-51.1-1', 'exist-47.1-1', 'feeding-39.7', 'fill-9.8', 'fulfilling-13.4.1', 'funnel-9.3-2-1', 'get-13.5.1', 'give-13.1-1', 'hit-18.1-1', 'hurt-40.8.3-2', 'investigate-35.4', 'knead-26.5', 'learn-14-1', 'learn-14-2-1', 'long-32.2-1', 'manner_speaking-37.3', 'modes_of_being_with_motion-47.3', 'murder-42.1-1', 'other_cos-45.4', 'peer-30.3', 'perform

Using custom data configuration default-dc77ba31d5024efe
Reusing dataset csv (/home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-1e39f50f01b28e67.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-8f2e5c046e0522f9.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-db29089dda3c7575.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-95e5e8c8172e7755.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/csv/default-dc77ba31d5024efe/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a2

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 3138
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 758
    })
    full: Dataset({
        features: ['idx', 'label', 'sentence'],
        num_rows: 3896
    })
})
{'idx': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=84, names=['answer.v.01', 'ask.v.01', 'ask.v.02', 'blow.v.01', 'brandish.v.01', 'break.v.05', 'burn.v.01', 'buy.v.01', 'charge.v.17', 'choose.v.01', 'clean.v.01', 'climb.v.01', 'close.v.01', 'connect.v.01', 'consult.v.02', 'cut.v.01', 'dig.v.01', 'drink.v.01', 'drive.v.01', 'drop.v.01', 'eat.v.01', 'enter.v.01', 'examine.v.02', 'exit.v.01', 'fill.v.01', 'follow.v.01', 'give.v.03', 'hit.v.02', 'hit.v.03', 'insert.v.01', 'insert.v.02', 'inventory.v.01', 'jump.v.01', 'kill.v.01', 'lie_down.v.01', 'light_up.v.05', 'listen.v.01', 'look.v.01', 'lower.v.01', 'memorize.v.01', 'move.v.02', 'note.v.04', 'open.v.01', 'pla

In [5]:
def build_label_maps(names):
    """Build the two lookup tables for a list of class names.

    Args:
        names: Class names in id order (position ``i`` is the name of class id ``i``).

    Returns:
        A ``(label2id, id2label)`` pair of dicts: name -> id and id -> name.
    """
    label2id = {name: idx for idx, name in enumerate(names)}
    id2label = {idx: name for idx, name in enumerate(names)}
    return label2id, id2label


# The maps are derived from the 'train' split of each dataset.
fn_label2id, fn_id2label = build_label_maps(fn_dataset['train'].features['label'].names)
print(fn_label2id)
print(fn_id2label)

npc_label2id, npc_id2label = build_label_maps(npc_dataset['train'].features['label'].names)
print(npc_label2id)
print(npc_id2label)

vn_label2id, vn_id2label = build_label_maps(vn_dataset['train'].features['label'].names)
print(vn_label2id)
print(vn_id2label)

wn_label2id, wn_id2label = build_label_maps(wn_dataset['train'].features['label'].names)
print(wn_label2id)
print(wn_id2label)

{'112': 0, '1132': 1, '1210': 2, '1262': 3, '139': 4, '1390': 5, '16': 6, '160': 7, '1688': 8, '17': 9, '171': 10, '197': 11, '2135': 12, '2161': 13, '254': 14, '263': 15, '264': 16, '265': 17, '273': 18, '276': 19, '279': 20, '2824': 21, '283': 22, '289': 23, '293': 24, '301': 25, '31': 26, '352': 27, '395': 28, '40': 29, '41': 30, '410': 31, '414': 32, '416': 33, '42': 34, '424': 35, '43': 36, '54': 37, '55': 38, '56': 39, '59': 40, '590': 41, '6': 42, '62': 43, '63': 44, '64': 45, '65': 46, '652': 47, '66': 48, '683': 49, '7': 50, '770': 51, '801': 52, 'unknown': 53}
{0: '112', 1: '1132', 2: '1210', 3: '1262', 4: '139', 5: '1390', 6: '16', 7: '160', 8: '1688', 9: '17', 10: '171', 11: '197', 12: '2135', 13: '2161', 14: '254', 15: '263', 16: '264', 17: '265', 18: '273', 19: '276', 20: '279', 21: '2824', 22: '283', 23: '289', 24: '293', 25: '301', 26: '31', 27: '352', 28: '395', 29: '40', 30: '41', 31: '410', 32: '414', 33: '416', 34: '42', 35: '424', 36: '43', 37: '54', 38: '55', 39: 

In [6]:
# Task lists for the sequential fine-tuning chain. Every entry is a
# (task, num_labels, prev) tuple, matching the parameters of run():
#   task       - name of the task to train on
#   num_labels - size of the classification head (None for question answering)
#   prev       - task whose "{prev}-trainer" encoder is loaded first (None = base checkpoint only)
# Read in order, each entry's `prev` is the task of the entry before it. The trailing comment
# on each list names the model class used for its tasks.

# GLUE: https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
GLUE_TASKS_1 = [  # AutoModelForSequenceClassification
    ("cola", 2, None),
]

# QA: https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
SQUAD_TASKS = [ # AutoModelForQuestionAnswering
    ("squad_v2", None, "cola")
]

GLUE_TASKS_2 = [  # AutoModelForSequenceClassification
    ("sst2", 2, "squad_v2"),
    ("qqp", 2, "sst2"),
    ("rte", 2, "qqp"),
]

# Token Classification: https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb
# The head size is the number of CoNLL-2003 chunk tags held in label_list.
CHUCK_TASKS = [ # AutoModelForTokenClassification
    ("chunk", len(label_list), "rte")
]

# "stsb" uses a single output (num_labels=1); run() reads its prediction from logits[:, 0].
GLUE_TASKS_3 = [  # AutoModelForSequenceClassification
    ("mrpc", 2, "chunk"),
    ("stsb", 1, "mrpc"),
    ("wnli", 2, "stsb"),
    ("mnli", 3, "wnli"),
]

# Tasks backed by the local TSV datasets. The label counts equal the num_classes values
# printed when those datasets were loaded (npc=3, fn=54, vn=70, wn=84).
JERICHO_TASKS = [  # AutoModelForSequenceClassification
    ("npc", 3, "mnli"),
    ("fn", 54, "npc"),
    ("vn", 70, "fn"),
    ("wn", 84, "vn")
]

# Same chain with "*_full" task names; it also starts from the "mnli" encoder.
JERICHO_TASKS_FULL = [  # AutoModelForSequenceClassification
    ("npc_full", 3, "mnli"),
    ("fn_full", 54, "npc_full"),
    ("vn_full", 70, "fn_full"),
    ("wn_full", 84, "vn_full")
]

In [7]:
# Set TOKENIZERS_PARALLELISM=false in the kernel's environment before any training run.
# NOTE(review): presumably this silences the tokenizers fork/parallelism warning - confirm.
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [8]:
#for task, num_labels, prev in GLUE_TASKS_1 + SQUAD_TASKS + GLUE_TASKS_2 + CHUCK_TASKS + GLUE_TASKS_3 + JERICHO_TASKS:
def run(task, num_labels, prev):
    """Fine-tune the base checkpoint on one task, optionally warm-starting the encoder.

    The task-specific model is instantiated from ``checkpoint``. When ``prev`` is given, its
    ``electra`` encoder is replaced by the encoder stored in ``"{prev}-trainer"``. After
    training, the encoder (without the task head) is written to ``"{task}-trainer"`` and the
    ``"{prev}-trainer"`` directory is deleted, except when ``prev`` is ``"mnli"``.

    Args:
        task: Task name. ``"squad_v2"`` (question answering), ``"chunk"`` (CoNLL-2003 chunk
            tagging) and ``"fn"``/``"npc"``/``"vn"``/``"wn"`` (local TSV datasets) are handled
            explicitly. Any other name is passed to ``load_dataset("glue", task)``; this
            includes the ``*_full`` names of JERICHO_TASKS_FULL, which are not special-cased.
        num_labels: Size of the classification head. Not used for ``"squad_v2"`` or for the
            four local datasets, whose models are given their label maps instead.
        prev: Name of the previously trained task, or None to start from ``checkpoint`` alone.

    Returns:
        None.
    """
    is_jericho = task in ("fn", "npc", "vn", "wn")
    if is_jericho:
        # Locally loaded dataset plus the label maps derived from its 'train' split.
        raw_datasets, label2id, id2label = {
            "fn": (fn_dataset, fn_label2id, fn_id2label),
            "npc": (npc_dataset, npc_label2id, npc_id2label),
            "vn": (vn_dataset, vn_label2id, vn_id2label),
            "wn": (wn_dataset, wn_label2id, wn_id2label),
        }[task]

    # --- Model: the task head always comes from the base checkpoint ---
    if task == "squad_v2":
        model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    elif task == "chunk":
        model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=num_labels)
    elif is_jericho:
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, label2id=label2id, id2label=id2label)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

    # Warm start: swap in the encoder saved by the previous task of the chain.
    if prev is not None:
        model.electra = AutoModel.from_pretrained(f"{prev}-trainer")

    # --- Data ---
    if task == "squad_v2":
        raw_datasets = load_dataset("squad_v2")
    elif task == "chunk":
        raw_datasets = load_dataset("conll2003")
    elif not is_jericho:
        raw_datasets = load_dataset("glue", task)

    # Text columns of the sentence-pair GLUE tasks.
    pair_columns = {
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "mrpc": ("sentence1", "sentence2"),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
        "mnli": ("premise", "hypothesis"),
    }

    def tokenize_chunk(example):
        """Tokenize pre-split words and copy each word's chunk tag onto its sub-tokens."""
        tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, stride=128)

        labels = []
        for i, label in enumerate(example["chunk_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # Tokens that map to no word (word id None) get the label -100.
            labels.append([-100 if word_idx is None else label[word_idx] for word_idx in word_ids])

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def tokenize_squad(example):
        """Tokenize question/context pairs into overlapping windows with answer token spans."""
        pad_on_right = tokenizer.padding_side == "right"
        # Sequence id of the context inside each encoded pair.
        context_sequence = 1 if pad_on_right else 0

        tokenized_examples = tokenizer(
            example["question" if pad_on_right else "context"],
            example["context" if pad_on_right else "question"],
            truncation="only_second" if pad_on_right else "only_first",
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length"
        )

        # One input example can yield several windows; this maps each window to its example.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)

            sample_index = sample_mapping[i]
            answers = example["answers"][sample_index]
            if len(answers["answer_start"]) == 0:
                # No answer: both positions point at the CLS token.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                continue

            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First and last token of the context inside this window.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_sequence:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_sequence:
                token_end_index -= 1

            answer_in_window = (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            )
            if not answer_in_window:
                # The answer is not fully inside this window: label it with CLS as well.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Move both indices inwards until they sit on the answer's boundary tokens.
                while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    def tokenize_function(example):
        """Dispatch a batch to the tokenization that matches ``task``."""
        if task in ("cola", "sst2"):
            return tokenizer(example["sentence"], truncation=True, stride=128)
        if task in pair_columns:
            first, second = pair_columns[task]
            return tokenizer(example[first], example[second], truncation=True, stride=128)
        if task == "chunk":
            return tokenize_chunk(example)
        if task == "squad_v2":
            return tokenize_squad(example)
        # Every remaining task is tokenized from its single "sentence" column.
        return tokenizer(example["sentence"], truncation=True)

    if task == "chunk":
        data_collator = DataCollatorForTokenClassification(tokenizer)
    else:
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # For squad_v2 the original columns are dropped, since one example may become several rows.
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names if task == "squad_v2" else None)

    # For a quick smoke test, switch both strategies to "steps" and set small values for
    # save_steps, eval_steps and max_steps.
    training_args = TrainingArguments(f"{task}-trainer",
                                      overwrite_output_dir=True,
                                      optim="adamw_torch",
                                      learning_rate=1e-4,
                                      weight_decay=0.01,
                                      warmup_ratio=0.1,
                                      adam_epsilon=1e-6,
                                      num_train_epochs=10.0 if is_jericho else 7.0,
                                      save_strategy="epoch",
                                      evaluation_strategy="epoch",
                                      save_total_limit=1,
                                      load_best_model_at_end=True,
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=32)

    # Load the metric once, before training, instead of on every evaluation pass.
    # squad_v2 is trained without compute_metrics, so it needs no metric.
    if task == "squad_v2":
        metric = None
    elif task == "chunk":
        metric = load_metric("seqeval")
    elif is_jericho:
        metric = load_metric("accuracy")
    else:
        metric = load_metric("glue", task)

    def compute_metrics(eval_preds):
        """Convert logits to predictions and score them with the task's metric."""
        logits, labels = eval_preds
        # stsb has a single output column; every other task takes the arg-max class.
        predictions = logits[:, 0] if task == "stsb" else np.argmax(logits, axis=-1)

        if task != "chunk":
            return metric.compute(predictions=predictions, references=labels)

        # Drop positions labelled -100 and map the remaining ids to tag names.
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Name of the split used for evaluation.
    if task == "mnli":
        eval_split = "validation_matched"
    elif is_jericho:
        eval_split = "test"
    else:
        eval_split = "validation"

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets[eval_split],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=None if task == "squad_v2" else compute_metrics
    )

    trainer.train()
    # Only the encoder is saved; the next task attaches its own head.
    trainer.model.electra.save_pretrained(f"{task}-trainer")

    # The previous encoder is no longer needed. "mnli" is kept because both JERICHO_TASKS and
    # JERICHO_TASKS_FULL start from it.
    if prev is not None and prev != "mnli" and os.path.exists(f"{prev}-trainer"):
        try:
            shutil.rmtree(f"{prev}-trainer")
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

In [9]:
# First task of the chain: no previous encoder to load.
run(task="cola", num_labels=2, prev=None)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-55d00ae1d67fa65d.arrow
Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6dd882ef945e0019.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8551
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1876


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.542559,0.439084
2,0.481300,0.507689,0.496658
3,0.481300,0.504116,0.553419
4,0.261500,0.534901,0.545334
5,0.261500,0.665376,0.563576
6,0.147100,0.733317,0.573205
7,0.147100,0.747631,0.574092


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 32
Saving model checkpoint to cola-trainer/checkpoint-268
Configuration saved in cola-trainer/checkpoint-268/config.json
Model weights saved in cola-trainer/checkpoint-268/pytorch_model.bin
tokenizer config file saved in cola-trainer/checkpoint-268/tokenizer_config.json
Special tokens file saved in cola-trainer/checkpoint-268/special_tokens_map.json
Deleting older checkpoint [cola-trainer/checkpoint-536] due to args.save_total_limit
Deleting older checkpoint [cola-trainer/checkpoint-1876] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward`

In [10]:
# Question answering, warm-started from the encoder saved by the "cola" run.
run(task="squad_v2", num_labels=None, prev="cola")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ap_default/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-cacda3fb52f40d9a.arrow


  0%|          | 0/12 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 130503
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 28553


Epoch,Training Loss,Validation Loss
1,1.3436,1.358325
2,1.0721,1.194226
3,0.8879,1.136428
4,0.7085,1.176706
5,0.5806,1.319421
6,0.4701,1.549892
7,0.3877,1.620008


***** Running Evaluation *****
  Num examples = 11969
  Batch size = 32
Saving model checkpoint to squad_v2-trainer/checkpoint-4079
Configuration saved in squad_v2-trainer/checkpoint-4079/config.json
Model weights saved in squad_v2-trainer/checkpoint-4079/pytorch_model.bin
tokenizer config file saved in squad_v2-trainer/checkpoint-4079/tokenizer_config.json
Special tokens file saved in squad_v2-trainer/checkpoint-4079/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 11969
  Batch size = 32
Saving model checkpoint to squad_v2-trainer/checkpoint-8158
Configuration saved in squad_v2-trainer/checkpoint-8158/config.json
Model weights saved in squad_v2-trainer/checkpoint-8158/pytorch_model.bin
tokenizer config file saved in squad_v2-trainer/checkpoint-8158/tokenizer_config.json
Special tokens file saved in squad_v2-trainer/checkpoint-8158/special_tokens_map.json
Deleting older checkpoint [squad_v2-trainer/checkpoint-4079] due to args.save_total_limit
***** Running Eval

In [11]:
# Sentiment classification, warm-started from the encoder saved by the "squad_v2" run.
run(task="sst2", num_labels=2, prev="squad_v2")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14735


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2645,0.360923,0.857798
2,0.1824,0.353908,0.891055
3,0.1406,0.391503,0.891055
4,0.1113,0.38693,0.90367
5,0.0831,0.373171,0.897936
6,0.0603,0.428789,0.895642
7,0.0466,0.473749,0.901376


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 32
Saving model checkpoint to sst2-trainer/checkpoint-2105
Configuration saved in sst2-trainer/checkpoint-2105/config.json
Model weights saved in sst2-trainer/checkpoint-2105/pytorch_model.bin
tokenizer config file saved in sst2-trainer/checkpoint-2105/tokenizer_config.json
Special tokens file saved in sst2-trainer/checkpoint-2105/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Runn

In [12]:
# Run the "qqp" task. The second argument (2) appears to be the label count;
# the recorded output reports Accuracy and F1 and saves to qqp-trainer/.
# NOTE(review): the third argument ("sst2") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("qqp", 2, "sst2")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, question1, question2. If idx, question1, question2 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 363846
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 79597


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3133,0.292421,0.871655,0.829941
2,0.2584,0.277175,0.881103,0.843231
3,0.2247,0.265185,0.888053,0.85753
4,0.1858,0.266624,0.898269,0.86336
5,0.1459,0.277629,0.903141,0.870597
6,0.1145,0.336603,0.901731,0.871461
7,0.0846,0.394128,0.904081,0.871436


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, question1, question2. If idx, question1, question2 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40430
  Batch size = 32
Saving model checkpoint to qqp-trainer/checkpoint-11371
Configuration saved in qqp-trainer/checkpoint-11371/config.json
Model weights saved in qqp-trainer/checkpoint-11371/pytorch_model.bin
tokenizer config file saved in qqp-trainer/checkpoint-11371/tokenizer_config.json
Special tokens file saved in qqp-trainer/checkpoint-11371/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, question1, question2. If idx, question1, question2 are not expected by `ElectraForSequenceClassification.forward

In [13]:
# Run the "rte" task. The second argument (2) appears to be the label count;
# the recorded output reports Accuracy and saves to rte-trainer/.
# NOTE(review): the third argument ("qqp") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("rte", 2, "qqp")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2490
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 546


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692772,0.509025
2,No log,0.668118,0.592058
3,No log,0.644537,0.631769
4,No log,0.69866,0.642599
5,No log,0.744542,0.642599
6,No log,0.769668,0.646209
7,0.522600,0.81478,0.631769


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 32
Saving model checkpoint to rte-trainer/checkpoint-78
Configuration saved in rte-trainer/checkpoint-78/config.json
Model weights saved in rte-trainer/checkpoint-78/pytorch_model.bin
tokenizer config file saved in rte-trainer/checkpoint-78/tokenizer_config.json
Special tokens file saved in rte-trainer/checkpoint-78/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safel

In [14]:
# Run the "chunk" task. The label count is taken from `label_list`, which an
# earlier cell fills with the CoNLL-2003 `chunk_tags` feature names. The
# recorded output shows ElectraForTokenClassification and chunk-trainer/.
# NOTE(review): the third argument ("rte") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("chunk", len(label_list), "rte")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: pos_tags, id, tokens, chunk_tags, ner_tags. If pos_tags, id, tokens, chunk_tags, ner_tags are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14042
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3073
Trainer is attempting to log a value of "{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LAB

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.319847,0.845958,0.858499,0.852183,0.918249
2,1.026400,0.237982,0.885404,0.88815,0.886775,0.938345
3,0.248000,0.222445,0.905267,0.895385,0.900299,0.945256
4,0.181800,0.208001,0.909959,0.898937,0.904415,0.947591
5,0.147900,0.212159,0.911461,0.902393,0.906904,0.948576
6,0.121400,0.204009,0.907739,0.904719,0.906226,0.948735
7,0.109400,0.211576,0.911387,0.904557,0.907959,0.94945


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: pos_tags, id, tokens, chunk_tags, ner_tags. If pos_tags, id, tokens, chunk_tags, ner_tags are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3251
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to chunk-trainer/checkpoint-439
Configuration saved in chunk-trainer/checkpoint-439/config.json
Model weights saved in chunk-trainer/checkpoint-439/pytorch_model.bin
tokenizer config file saved in chunk-trainer/checkpoint-439/tokenizer_config.json
Special tokens file saved in chunk-trainer/checkpoint-439/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: pos_tags, id, tokens, chunk_tags, ner_

In [15]:
# Run the "mrpc" task. The second argument (2) appears to be the label count;
# the recorded output reports Accuracy and F1 and saves to mrpc-trainer/.
# NOTE(review): the third argument ("chunk") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("mrpc", 2, "chunk")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3668
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 805


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.611106,0.70098,0.820059
2,No log,0.525511,0.745098,0.834921
3,No log,0.500907,0.75,0.827119
4,No log,0.586037,0.776961,0.849587
5,0.431600,0.635375,0.769608,0.842282
6,0.431600,0.749043,0.781863,0.845217
7,0.431600,0.778755,0.786765,0.848168


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 32
Saving model checkpoint to mrpc-trainer/checkpoint-115
Configuration saved in mrpc-trainer/checkpoint-115/config.json
Model weights saved in mrpc-trainer/checkpoint-115/pytorch_model.bin
tokenizer config file saved in mrpc-trainer/checkpoint-115/tokenizer_config.json
Special tokens file saved in mrpc-trainer/checkpoint-115/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you

In [16]:
# Run the "stsb" task with a single output (the printed config has one
# id2label entry). The recorded metrics are Pearson/Spearman correlations, so
# this run is presumably treated as regression - verify inside `run`.
# NOTE(review): the third argument ("mrpc") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("stsb", 1, "mrpc")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_las

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5749
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1260


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,No log,0.652621,0.845683,0.841065
2,No log,0.606566,0.866593,0.864842
3,1.722800,0.599615,0.871527,0.869917
4,1.722800,0.594279,0.87234,0.87254
5,1.722800,0.591404,0.872534,0.871335
6,0.239500,0.5799,0.874185,0.871654
7,0.239500,0.592205,0.874243,0.871459


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 32
Saving model checkpoint to stsb-trainer/checkpoint-180
Configuration saved in stsb-trainer/checkpoint-180/config.json
Model weights saved in stsb-trainer/checkpoint-180/pytorch_model.bin
tokenizer config file saved in stsb-trainer/checkpoint-180/tokenizer_config.json
Special tokens file saved in stsb-trainer/checkpoint-180/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  yo

In [17]:
# Run the "wnli" task. The second argument (2) appears to be the label count;
# the recorded output saves to wnli-trainer/.
# NOTE(review): recorded validation accuracy ranges from 0.37 to 0.56 over the
# 7 epochs (71 eval examples), i.e. around chance for two labels.
# NOTE(review): the third argument ("stsb") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("wnli", 2, "stsb")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tra

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 635
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 140


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.688438,0.56338
2,No log,0.692742,0.464789
3,No log,0.693777,0.450704
4,No log,0.691511,0.56338
5,No log,0.694226,0.450704
6,No log,0.696077,0.366197
7,No log,0.69682,0.394366


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 71
  Batch size = 32
Saving model checkpoint to wnli-trainer/checkpoint-20
Configuration saved in wnli-trainer/checkpoint-20/config.json
Model weights saved in wnli-trainer/checkpoint-20/pytorch_model.bin
tokenizer config file saved in wnli-trainer/checkpoint-20/tokenizer_config.json
Special tokens file saved in wnli-trainer/checkpoint-20/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence2, sentence1. If idx, sentence2, sentence1 are not expected by `ElectraForSequenceClassification.forward`,  you can s

In [18]:
# Run the "mnli" task with 3 labels (the printed config lists LABEL_0..LABEL_2);
# the recorded output reports Accuracy and saves to mnli-trainer/.
# NOTE(review): the third argument ("wnli") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("mnli", 3, "wnli")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/393 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise. If idx, hypothesis, premise are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 392702
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 85904


Epoch,Training Loss,Validation Loss,Accuracy
1,0.618,0.601395,0.763016
2,0.5418,0.570949,0.783902
3,0.4764,0.529574,0.798879
4,0.3912,0.538816,0.802955
5,0.3361,0.596635,0.804381
6,0.2791,0.635879,0.804279
7,0.2255,0.689345,0.80703


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise. If idx, hypothesis, premise are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9815
  Batch size = 32
Saving model checkpoint to mnli-trainer/checkpoint-12272
Configuration saved in mnli-trainer/checkpoint-12272/config.json
Model weights saved in mnli-trainer/checkpoint-12272/pytorch_model.bin
tokenizer config file saved in mnli-trainer/checkpoint-12272/tokenizer_config.json
Special tokens file saved in mnli-trainer/checkpoint-12272/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise. If idx, hypothesis, premise are not expected by `ElectraForSequenceClassification.forward

In [19]:
# Run the "npc" task with 3 labels; the printed config maps them to
# "0", "1" and "unknown". Presumably this uses `npc_dataset` loaded from the
# npc_*.tsv files in an earlier cell - verify inside `run`.
# The recorded output shows 10 epochs here, versus 7 for the preceding runs.
# NOTE(review): the third argument ("mnli") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("npc", 3, "mnli")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "0",
    "1": "1",
    "2": "unknown"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "0": 0,
    "1": 1,
    "unknown": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absol

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3327
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1040


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.066416,0.989183
2,No log,0.04195,0.99399
3,No log,0.029964,0.99399
4,No log,0.022454,0.997596
5,0.119100,0.029048,0.996394
6,0.119100,0.020444,0.996394
7,0.119100,0.008506,0.997596
8,0.119100,0.006137,0.998798
9,0.119100,0.004493,0.998798
10,0.005100,0.001759,1.0


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 832
  Batch size = 32
Saving model checkpoint to npc-trainer/checkpoint-104
Configuration saved in npc-trainer/checkpoint-104/config.json
Model weights saved in npc-trainer/checkpoint-104/pytorch_model.bin
tokenizer config file saved in npc-trainer/checkpoint-104/tokenizer_config.json
Special tokens file saved in npc-trainer/checkpoint-104/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evalua

In [20]:
# Run the "fn" task. Presumably this uses `fn_dataset` loaded from the
# fn_*.tsv files in an earlier cell - verify inside `run`.
# NOTE(review): 54 is a hard-coded label count; it should agree with the
# number of classes in the encoded `label` column - confirm.
# NOTE(review): the third argument ("npc") is the task run in the previous
# cell; how `run` uses it is not visible here - confirm.
run("fn", 54, "npc")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "112",
    "1": "1132",
    "2": "1210",
    "3": "1262",
    "4": "139",
    "5": "1390",
    "6": "16",
    "7": "160",
    "8": "1688",
    "9": "17",
    "10": "171",
    "11": "197",
    "12": "2135",
    "13": "2161",
    "14": "254",
    "15": "263",
    "16": "264",
    "17": "265",
    "18": "273",
    "19": "276",
    "20": "279",
    "21": "2824",
 

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3329
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1050
Trainer is attempting to log a value of "{0: '112', 1: '1132', 2: '1210', 3: '1262', 4: '139', 5: '1390', 6: '16', 7: '160', 8: '1688', 9: '17', 10: '171', 11: '197', 12: 

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.001414,0.335366
2,No log,1.732983,0.570732
3,No log,1.169175,0.757317
4,No log,0.912826,0.789024
5,1.957400,0.7951,0.809756
6,1.957400,0.748937,0.817073
7,1.957400,0.690337,0.813415
8,1.957400,0.660422,0.834146
9,1.957400,0.642226,0.832927
10,0.654800,0.638219,0.835366


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 820
  Batch size = 32
Saving model checkpoint to fn-trainer/checkpoint-105
Configuration saved in fn-trainer/checkpoint-105/config.json
Model weights saved in fn-trainer/checkpoint-105/pytorch_model.bin
tokenizer config file saved in fn-trainer/checkpoint-105/tokenizer_config.json
Special tokens file saved in fn-trainer/checkpoint-105/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation 

In [21]:
# Fine-tune on the "vn" task via the notebook's run() helper (defined in an earlier cell).
# NOTE(review): the arguments presumably follow the (task, num_labels, prev) order used by
# the JERICHO_TASKS_FULL loop, i.e. 70 labels and the "fn" encoder as the starting point —
# TODO confirm against run()'s definition.
run("vn", 70, "fn")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "appear-48.1.1",
    "1": "assuming_position-50",
    "2": "banish-10.2",
    "3": "body_internal_states-40.6",
    "4": "break-45.1",
    "5": "bump-18.4",
    "6": "carry-11.4",
    "7": "carry-11.4-1-1",
    "8": "carve-21.2-2",
    "9": "chase-51.6",
    "10": "crane-40.3.2",
    "11": "cut-21.1-1",
    "12": "disassemble-23.3",
    "13": "eat-39.1-1",
   

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4718
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1480
Trainer is attempting to log a value of "{0: 'appear-48.1.1', 1: 'assuming_position-50', 2: 'banish-10.2', 3: 'body_internal_states-40.6', 4: 'break-45.1', 5: 'bump-18.4',

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.45975,0.469671
2,No log,1.531336,0.515598
3,No log,1.218851,0.556326
4,2.225600,1.074604,0.583189
5,2.225600,1.004782,0.595321
6,2.225600,0.953358,0.622184
7,1.008100,0.924729,0.618718
8,1.008100,0.898267,0.629983
9,1.008100,0.881558,0.631716
10,1.008100,0.87606,0.62565


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1154
  Batch size = 32
Saving model checkpoint to vn-trainer/checkpoint-148
Configuration saved in vn-trainer/checkpoint-148/config.json
Model weights saved in vn-trainer/checkpoint-148/pytorch_model.bin
tokenizer config file saved in vn-trainer/checkpoint-148/tokenizer_config.json
Special tokens file saved in vn-trainer/checkpoint-148/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation

In [22]:
# Fine-tune on the "wn" task via the notebook's run() helper (defined in an earlier cell).
# NOTE(review): the arguments presumably follow the (task, num_labels, prev) order used by
# the JERICHO_TASKS_FULL loop, i.e. 84 labels and the "vn" encoder as the starting point —
# TODO confirm against run()'s definition.
run("wn", 84, "vn")

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "answer.v.01",
    "1": "ask.v.01",
    "2": "ask.v.02",
    "3": "blow.v.01",
    "4": "brandish.v.01",
    "5": "break.v.05",
    "6": "burn.v.01",
    "7": "buy.v.01",
    "8": "charge.v.17",
    "9": "choose.v.01",
    "10": "clean.v.01",
    "11": "climb.v.01",
    "12": "close.v.01",
    "13": "connect.v.01",
    "14": "consult.v.02",
    "15": "cut.v.01

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3138
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 990
Trainer is attempting to log a value of "{0: 'answer.v.01', 1: 'ask.v.01', 2: 'ask.v.02', 3: 'blow.v.01', 4: 'brandish.v.01', 5: 'break.v.05', 6: 'burn.v.01', 7: 'buy.v.01'

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.204472,0.473615
2,No log,1.584177,0.612137
3,No log,1.059297,0.766491
4,No log,0.778776,0.845646
5,No log,0.638297,0.869393
6,1.919400,0.541795,0.881266
7,1.919400,0.488972,0.890501
8,1.919400,0.456037,0.907652
9,1.919400,0.433751,0.908971
10,1.919400,0.427066,0.911609


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 758
  Batch size = 32
Saving model checkpoint to wn-trainer/checkpoint-99
Configuration saved in wn-trainer/checkpoint-99/config.json
Model weights saved in wn-trainer/checkpoint-99/pytorch_model.bin
tokenizer config file saved in wn-trainer/checkpoint-99/tokenizer_config.json
Special tokens file saved in wn-trainer/checkpoint-99/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****

In [23]:
# Sequentially fine-tune the ELECTRA encoder on the complete ("full") split of every
# task in JERICHO_TASKS_FULL. Each task gets a fresh classification head; when `prev`
# is given, the encoder saved by that earlier task is loaded so that training is chained.

# Per-task (label2id, id2label, dataset) lookup. Any task name that is not listed falls
# back to the WordNet resources, mirroring the original if/elif/else chain.
_FULL_TASK_RESOURCES = {
    "fn_full": (fn_label2id, fn_id2label, fn_dataset),
    "npc_full": (npc_label2id, npc_id2label, npc_dataset),
    "vn_full": (vn_label2id, vn_id2label, vn_dataset),
}
_WN_FULL_RESOURCES = (wn_label2id, wn_id2label, wn_dataset)


def tokenize_function(example):
    """Tokenize the `sentence` column of a batch, truncating over-long inputs."""
    return tokenizer(example["sentence"], truncation=True)


def compute_metrics(eval_preds):
    """Return accuracy for a `(logits, labels)` pair produced by the Trainer.

    The metric is loaded lazily because evaluation is disabled in this cell, so the
    function only runs if `trainer.evaluate()` is called manually.
    """
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Loop-invariant: the same tokenizer-backed collator serves every task.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# `num_labels` is part of each task tuple but is not needed here: the size of the
# classification head is derived from the id2label mapping passed to from_pretrained.
for task, num_labels, prev in JERICHO_TASKS_FULL:
    task_label2id, task_id2label, raw_datasets = _FULL_TASK_RESOURCES.get(task, _WN_FULL_RESOURCES)

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                               label2id=task_label2id,
                                                               id2label=task_id2label)
    if prev is not None:
        # Continue from the encoder produced by the previous task in the chain.
        model.electra = AutoModel.from_pretrained(f"{prev}-trainer")

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    # No checkpoints are saved and no evaluation is run while training on the full
    # split, so `load_best_model_at_end` was removed: with both strategies set to "no"
    # there is never a best checkpoint to reload.
    training_args = TrainingArguments(f"{task}-trainer",
                                      overwrite_output_dir=True,
                                      optim="adamw_torch",
                                      learning_rate=1e-4,
                                      weight_decay=0.01,
                                      warmup_ratio=0.1,
                                      adam_epsilon=1e-6,
                                      num_train_epochs=10.0,
                                      save_strategy="no",
                                      evaluation_strategy="no",
                                      save_total_limit=1,
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=32)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["full"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Persist only the encoder; the task-specific classification head is discarded.
    trainer.model.electra.save_pretrained(f"{task}-trainer")

    # Remove the previous task's encoder once it has been consumed. The NPC encoder is
    # kept because it is exported separately later in the notebook. The explicit None
    # check avoids probing (and possibly deleting) a directory literally named
    # "None-trainer".
    if prev is not None and prev != "npc_full" and os.path.exists(f"{prev}-trainer"):
        try:
            shutil.rmtree(f"{prev}-trainer")
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "0",
    "1": "1",
    "2": "unknown"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "0": 0,
    "1": 1,
    "unknown": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absol

Step,Training Loss
500,0.1764
1000,0.0288




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in npc_full-trainer/config.json
Model weights saved in npc_full-trainer/pytorch_model.bin
loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "112",
    "1": "1132",
    "2": "1210",
    "3": "1262",
    "4": "139",
    "5": "1390",
    "6": "16",
    "7": "160",
    "8": "1688",
    "9": "17",
    "10": "1

Step,Training Loss
500,2.1599
1000,0.7242




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in fn_full-trainer/config.json
Model weights saved in fn_full-trainer/pytorch_model.bin
loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "appear-48.1.1",
    "1": "assuming_position-50",
    "2": "banish-10.2",
    "3": "body_internal_states-40.6",
    "4": "break-45.1",
    "5": "bump-18.4",
    "6": "c

Step,Training Loss
500,2.362
1000,1.0654
1500,0.8534




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in vn_full-trainer/config.json
Model weights saved in vn_full-trainer/pytorch_model.bin
loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ap_default/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "answer.v.01",
    "1": "ask.v.01",
    "2": "ask.v.02",
    "3": "blow.v.01",
    "4": "brandish.v.01",
    "5": "break.v.05",
    "6": "burn.v.01",
    "7": "buy.v.01

Step,Training Loss
500,1.9241
1000,0.4461




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in wn_full-trainer/config.json
Model weights saved in wn_full-trainer/pytorch_model.bin


In [24]:
# Export the final encoders in both PyTorch and TensorFlow formats, together with the
# tokenizer, then remove the intermediate training directories.
from transformers import TFAutoModel

# (trainer output directory, export directory) pairs, processed in order. After the
# loop `if_model` / `tf_model` refer to the NPC encoder, as in the original cell.
for trainer_dir, export_dir in (("wn_full-trainer", "if_model"),
                                ("npc_full-trainer", "npc_model")):
    if_model = AutoModel.from_pretrained(trainer_dir)
    tokenizer.save_pretrained(export_dir)
    if_model.save_pretrained(export_dir)

    # Convert the PyTorch weights to TensorFlow and store them next to the PyTorch ones.
    tf_model = TFAutoModel.from_pretrained(trainer_dir, from_pt=True)
    tf_model.save_pretrained(export_dir)

# Flax export is currently disabled.
# from transformers import FlaxAutoModel
# 
# fx_model = FlaxAutoModel.from_pretrained("wn-trainer", from_pt=True)
# fx_model.save_pretrained("if-model")

# Best-effort cleanup. Each directory is removed independently so that one missing or
# locked directory does not prevent the remaining ones from being deleted.
for stale_dir in ("wn-trainer", "wn_full-trainer", "npc_full-trainer", "mlruns"):
    try:
        shutil.rmtree(stale_dir)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

loading configuration file wn_full-trainer/config.json
Model config ElectraConfig {
  "_name_or_path": "wn_full-trainer",
  "architectures": [
    "ElectraModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file wn_full-trainer/pytorch_model.bin
All model checkpoint weights were used when 

In [25]:
!zip if_model.zip -r if_model
!zip npc_model.zip -r npc_model

updating: if_model/ (stored 0%)
updating: if_model/tokenizer.json (deflated 71%)
updating: if_model/special_tokens_map.json (deflated 40%)
updating: if_model/tf_model.h5 (deflated 8%)
updating: if_model/pytorch_model.bin (deflated 7%)
updating: if_model/config.json (deflated 52%)
updating: if_model/vocab.txt (deflated 53%)
updating: if_model/tokenizer_config.json (deflated 40%)
updating: npc_model/ (stored 0%)
updating: npc_model/tokenizer.json (deflated 71%)
updating: npc_model/special_tokens_map.json (deflated 40%)
updating: npc_model/tf_model.h5 (deflated 8%)
updating: npc_model/pytorch_model.bin (deflated 7%)
updating: npc_model/config.json (deflated 52%)
updating: npc_model/vocab.txt (deflated 53%)
updating: npc_model/tokenizer_config.json (deflated 40%)
