In [1]:
from data_utils import CQADatasetLoader, SVAMPDatasetLoader, ESNLIDatasetLoader, ANLI1DatasetLoader, ASDivDatasetLoader


In [2]:
# Use the ESNLI loader; the other loaders imported above are not used in the visible cells.
dataset_loader = ESNLIDatasetLoader()

In [3]:
# Load the splits from JSON. The recorded progress output lists train/test/valid
# splits and row counts of 549367, 9824 and 9842.
dataset = dataset_loader.load_from_json()

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [4]:
# Fetch the (rationales, labels) pair returned by load_llm_preds for each split,
# calling it in the order train, test, valid.
(
    (train_llm_rationales, train_llm_labels),
    (test_llm_rationales, test_llm_labels),
    (valid_llm_rationales, valid_llm_labels),
) = [
    dataset_loader.load_llm_preds(split=split_name)
    for split_name in ('train', 'test', 'valid')
]

In [5]:
# Attach the loaded predictions to every split as two new columns,
# 'llm_label' first and 'llm_rationale' second.
# Re-running this cell re-adds columns that already exist, so run it once per fresh `dataset`.
dataset['train'] = (
    dataset['train']
    .add_column('llm_label', train_llm_labels)
    .add_column('llm_rationale', train_llm_rationales)
)
dataset['test'] = (
    dataset['test']
    .add_column('llm_label', test_llm_labels)
    .add_column('llm_rationale', test_llm_rationales)
)
dataset['valid'] = (
    dataset['valid']
    .add_column('llm_label', valid_llm_labels)
    .add_column('llm_rationale', valid_llm_rationales)
)

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer for the "google/flan-t5-small" checkpoint; used by the preprocessing cells below.
# NOTE(review): the same tokenizer is loaded again in the model-loading cell further down.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Collapse each premise/hypothesis pair into a single 'input' string, using the
# tokenizer's EOS token as the separator, and drop the two source columns.
dataset = dataset.map(
    lambda row: {'input': row['premise'] + tokenizer.eos_token + row['hypothesis']},
    remove_columns=['premise', 'hypothesis'],
)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [11]:
def tokenize_function(examples):
    """Tokenize one batch for joint rationale generation and classification.

    Reads the module-level `tokenizer` and `label2id`; both must be defined
    before this function is called (in this notebook `label2id` is created in a
    later cell, so run that cell first on a fresh kernel).

    Args:
        examples: batch dict with the columns 'input' (source text),
            'llm_rationale' (target text) and 'label' (keys of `label2id`).

    Returns:
        The tokenizer output for 'input' (truncated to 512 tokens) extended with
            'labels' - token ids of 'llm_rationale', truncated to 256 tokens;
            'label'  - integer class ids looked up in `label2id`.

    Raises:
        KeyError: if a value in examples['label'] is not a key of `label2id`.
    """
    model_inputs = tokenizer(examples['input'], max_length=512, truncation=True)

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    rationale_output_encodings = tokenizer(
        text_target=examples['llm_rationale'], max_length=256, truncation=True
    )

    model_inputs['labels'] = rationale_output_encodings['input_ids']
    model_inputs['label'] = [label2id[e] for e in examples['label']]

    return model_inputs

In [12]:
# Tokenize every split in batches. The raw text columns and the original
# 'label' column are removed; tokenize_function's output supplies the new
# 'labels' and integer 'label' columns.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=['input', 'llm_label', 'llm_rationale', 'label'],
)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]



Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [28]:
# Persist the tokenized splits to the local "tokenized_datasets" directory;
# a later cell reloads them from there with load_from_disk.
tokenized_datasets.save_to_disk("tokenized_datasets")

Saving the dataset (0/1 shards):   0%|          | 0/549367 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9824 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9842 [00:00<?, ? examples/s]

In [1]:
# Lookup tables between integer class ids and class names.
# `label2id` is the exact inverse of `id2label`.
id2label = dict(enumerate(("contradiction", "entailment", "neutral")))
label2id = {name: idx for idx, name in id2label.items()}

In [15]:
from t5_enc.t5 import T5ForConditionalGenerationAndSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the project's two-headed T5 model from the flan-t5-small checkpoint with
# a 3-class head. The load warning recorded below reports that
# clf_head.weight / clf_head.bias are newly initialised.
model = T5ForConditionalGenerationAndSequenceClassification.from_pretrained(
    "google/flan-t5-small",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Matching tokenizer for the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

Some weights of T5ForConditionalGenerationAndSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['clf_head.bias', 'clf_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from datasets import load_from_disk
# Reload the tokenized splits from the local "tokenized_datasets" directory.
tokenized_datasets = load_from_disk("tokenized_datasets")

In [4]:
# Subsample the train and test splits: keep only the 10% 'test' portion of a
# seeded train_test_split of each (the recorded output shows 54937 and 983 rows).
# The 'valid' split is left untouched.
# NOTE: this overwrites the splits in place, so re-running the cell shrinks them again.
tokenized_datasets.update({
    split_name: tokenized_datasets[split_name].train_test_split(test_size=0.1, seed=0)['test']
    for split_name in ("train", "test")
})

In [5]:
# Display the splits with their columns and row counts.
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 54937
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 983
    })
    valid: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9842
    })
})

In [16]:
from transformers import Seq2SeqTrainingArguments, TrainingArguments

training_args = TrainingArguments(
    # Outputs and checkpointing.
    output_dir="./results",
    save_steps=500,
    save_total_limit=3,
    # Logging.
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=250,
    # Evaluation and best-model selection. The eval set is registered under the
    # name 'test' in trainer_kwargs, hence the "test_accuracy" metric name.
    evaluation_strategy='steps',
    eval_steps=500,
    prediction_loss_only=False,
    load_best_model_at_end=True,
    metric_for_best_model="test_accuracy",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Keep every dataset column: both 'label' and 'labels' are passed on to the model.
    remove_unused_columns=False,
    seed=0,
)

In [17]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns lists of tokenized rows into batches; it is handed to the
# trainer through trainer_kwargs['data_collator'].
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
p = None  # scratch global read by the debugging cells further down; never set here


def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    `eval_pred` unpacks into (predictions, labels). Class scores are taken from
    predictions[2] and gold class ids from labels[0]; accuracy is the fraction
    of rows whose argmax over axis 1 equals the gold id.

    Returns:
        dict with a single key, 'accuracy'.
    """
    model_outputs, references = eval_pred
    class_scores, gold_ids = model_outputs[2], references[0]
    hits = class_scores.argmax(1) == gold_ids
    return {'accuracy': hits.mean()}

In [19]:
# Keyword arguments for the trainer constructor. The evaluation set is given
# as a named mapping ('test'), which is why metrics appear as "Test ..." in the
# training log.
# `alpha` is not a standard option here; presumably it is consumed by the
# project's MyTrainer - TODO confirm.
trainer_kwargs = dict(
    alpha=0.999,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=dict(test=tokenized_datasets["test"]),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
from transformers import T5Config, Seq2SeqTrainer, Trainer
# trainer = Trainer(**trainer_kwargs)

In [21]:
from t5_enc.t5 import MyTrainer

# Build the project's custom trainer (imported from t5_enc.t5) from the kwargs assembled above.
trainer = MyTrainer(**trainer_kwargs)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
# data = data_collator([tokenized_datasets["train"][i] for i in range(1)])
# data = data.to('cuda')
# model.to('cuda')

In [23]:
# model(**data)

In [24]:
# NOTE(review): in the recorded run the training loss is logged as 0.0, the test
# loss as inf, and test accuracy stays at 0.344863 across checkpoints before the
# run was interrupted - the loss computation should be checked before trusting results.
trainer.train()

Step,Training Loss,Validation Loss,Test Loss,Test Accuracy
500,0.0,No log,inf,0.344863
1000,0.0,No log,inf,0.344863


KeyboardInterrupt: 

In [62]:
from transformers.utils import is_sagemaker_mp_enabled
# Debugging probe; the recorded run returned False.
is_sagemaker_mp_enabled()

False

In [61]:
# NOTE(review): `data` is only created in a commented-out cell above (a collated
# batch moved to 'cuda'); on a fresh kernel this line fails unless that cell is restored.
loss, logits, labels = trainer.prediction_step(model, data, False)

In [64]:
# Inspect what prediction_step returned as `logits`. In the recorded output it is
# a tuple whose third element is a (1, 3) tensor - presumably the per-class output
# that compute_metrics reads as predictions[2] (TODO confirm).
logits

(tensor(1.0986, device='cuda:0'),
 tensor([[[-43.3450,  -3.8232,  -9.5689,  ..., -43.3030, -43.3066, -43.2029],
          [-36.6960,  -3.2038,  -6.5995,  ..., -36.6678, -36.7356, -36.4953],
          [-41.2707,  -3.4888,  -7.1264,  ..., -41.1566, -41.2695, -41.0651],
          ...,
          [-35.1877,  -2.2944,  -5.5366,  ..., -35.1947, -35.2855, -35.0264],
          [-50.8232,  -2.5227,  -9.2106,  ..., -50.8954, -50.9176, -50.8074],
          [-56.5695,   1.0558, -10.8807,  ..., -56.6607, -56.6109, -56.7145]]],
        device='cuda:0'),
 tensor([[2.8923e-42, 0.0000e+00, 1.1210e-43]], device='cuda:0'),
 tensor([[[-0.2601, -0.0773,  0.1274,  ..., -0.1361, -0.0117, -0.1421],
          [-0.2268, -0.0288,  0.1280,  ..., -0.2131, -0.0716, -0.0701],
          [-0.1313, -0.0290, -0.0152,  ...,  0.0543,  0.0155, -0.0723],
          ...,
          [ 0.1917, -0.0318, -0.0480,  ...,  0.1162, -0.0731, -0.0015],
          [-0.0740,  0.0112, -0.1586,  ..., -0.0183, -0.1629, -0.0748],
          [-0.

In [109]:
# Debug run: predict on the first two training rows, passed as a plain list.
trainer.predict([tokenized_datasets["train"][i] for i in range(2)])

PredictionOutput(predictions=(array(1.0986123, dtype=float32), array([[[-43.34498   ,  -3.8231604 ,  -9.568944  , ..., -43.302944  ,
         -43.306618  , -43.202892  ],
        [-36.69596   ,  -3.2037597 ,  -6.599509  , ..., -36.66781   ,
         -36.735657  , -36.495323  ],
        [-41.270668  ,  -3.4887943 ,  -7.1263914 , ..., -41.156574  ,
         -41.269527  , -41.065075  ],
        ...,
        [-35.18772   ,  -2.2944055 ,  -5.536606  , ..., -35.19469   ,
         -35.2855    , -35.026443  ],
        [-50.823196  ,  -2.522661  ,  -9.210551  , ..., -50.89539   ,
         -50.917576  , -50.807392  ],
        [-56.569576  ,   1.0558416 , -10.880717  , ..., -56.66072   ,
         -56.6109    , -56.71448   ]],

       [[-44.15978   ,  -3.505008  ,  -8.9093075 , ..., -44.14348   ,
         -44.124973  , -44.057106  ],
        [-37.43812   ,  -3.539407  ,  -7.5645704 , ..., -10.490766  ,
          -8.716219  ,  -4.3990593 ],
        [-41.715206  ,  -8.8815565 ,  -4.810776  , ..., -1

In [57]:
# Recorded value is ['label', 'labels']; 'label' comes first, which presumably is
# why compute_metrics takes the class ids from labels[0] (TODO confirm).
trainer.label_names

['label', 'labels']

In [55]:
# Inspect the labels returned by prediction_step; the recorded value is a 2-tuple
# of tensors (a single-element tensor, then a sequence of token ids).
labels

(tensor([2], device='cuda:0'),
 tensor([[  37,  568,  228,   36,  761,  112, 4952,   21,    3,    9, 2259,    6,
            68,   34,   19,   59, 6539,    8,  495,    5,    1]],
        device='cuda:0'))

In [87]:
# NOTE(review): `p` is initialised to None and compute_metrics as written never
# assigns it, so this raises TypeError (as in the recorded output).
p[1].shape

TypeError: 'NoneType' object is not subscriptable

IndexError: piece id is out of range.

In [97]:
# NOTE(review): depends on hidden kernel state - `p` is only ever assigned None in
# the visible cells, so the recorded array output cannot be reproduced from them.
p[0].argmax(1)

array([0, 0])

In [104]:
# Fraction of positions where p[1] equals the argmax of p[0] over axis 1.
# NOTE(review): same hidden-state caveat - `p` is only ever assigned None in the visible cells.
(p[1] == p[0].argmax(1)).mean()

0.5

In [100]:
import evaluate

# NOTE(review): the recorded run failed here with ImportError because scikit-learn
# is not installed. `accuracy` is not used by any visible cell; compute_metrics
# calculates accuracy by hand.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/accuracy, you need to install the following dependencies['scikit-learn'] using 'pip install sklearn' for instance'

In [None]:
0. проанализировать лосс
1. сначала генерация потом клф
1.5. по шагам сначала декодер, потом клф
2. чередование
3. clipping
4. 2-stage distillation (gpt-4 -> t5-large (gen) -> t5-small (gen, kl-div + clf))
5. 2-головые модели, как складывать лоссы