In [None]:
from data_utils import CQADatasetLoader, SVAMPDatasetLoader, ESNLIDatasetLoader, ANLI1DatasetLoader, ASDivDatasetLoader


In [None]:
# Instantiate the e-SNLI loader (one of the loaders imported from data_utils).
dataset_loader = ESNLIDatasetLoader()

In [None]:
# Load the dataset through the loader's JSON reader; later cells index the result
# by the split names 'train', 'test' and 'valid'.
dataset = dataset_loader.load_from_json()

In [None]:
# Fetch the LLM-produced (rationales, labels) pair for every split in one unpacking;
# the loader is called in train, test, valid order.
(
    (train_llm_rationales, train_llm_labels),
    (test_llm_rationales, test_llm_labels),
    (valid_llm_rationales, valid_llm_labels),
) = [dataset_loader.load_llm_preds(split=name) for name in ('train', 'test', 'valid')]

In [None]:
# Attach the LLM predictions to each split: 'llm_label' first, then 'llm_rationale'.
llm_predictions_by_split = {
    'train': (train_llm_labels, train_llm_rationales),
    'test': (test_llm_labels, test_llm_rationales),
    'valid': (valid_llm_labels, valid_llm_rationales),
}
for split_name, (split_labels, split_rationales) in llm_predictions_by_split.items():
    dataset[split_name] = (
        dataset[split_name]
        .add_column('llm_label', split_labels)
        .add_column('llm_rationale', split_rationales)
    )

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer for the google/flan-t5-small checkpoint; used below to build and tokenize inputs.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

In [None]:
def _join_premise_and_hypothesis(example):
    """Build the 'input' text: premise and hypothesis joined by the tokenizer's EOS token."""
    sentence_pair = [example['premise'], example['hypothesis']]
    return {'input': tokenizer.eos_token.join(sentence_pair)}


# The two source columns are dropped once they have been merged into 'input'.
dataset = dataset.map(_join_premise_and_hypothesis, remove_columns=['premise', 'hypothesis'])

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples for joint rationale generation and classification.

    Args:
        examples: A batch (dict of lists) with the keys 'input', 'llm_rationale'
            and 'label'.

    Returns:
        The tokenizer output for 'input' (truncated to at most 512 tokens), extended with:
        'labels' - token ids of the rationales (truncated to at most 256 tokens);
        'label'  - the integer class id looked up in ``label2id`` for each 'label' value.
    """
    model_inputs = tokenizer(examples['input'], max_length=512, truncation=True)

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    rationale_output_encodings = tokenizer(
        text_target=examples['llm_rationale'], max_length=256, truncation=True
    )

    model_inputs['labels'] = rationale_output_encodings['input_ids']
    # label2id is the notebook-level mapping from class name to integer id.
    model_inputs['label'] = [label2id[e] for e in examples['label']]

    return model_inputs

In [None]:
# Class-id <-> class-name lookup tables for the three NLI classes.
id2label = dict(enumerate(("contradiction", "entailment", "neutral")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# 'label' is listed for removal, yet tokenize_function also returns a 'label' column
# (integer class ids); the features printed further down still include 'label'.
columns_to_drop = ['input', 'llm_label', 'llm_rationale', 'label']
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=columns_to_drop)

In [None]:
# Persist the tokenized splits under "tokenized_datasets"; the training cells reload
# them from this path with load_from_disk.
tokenized_datasets.save_to_disk("tokenized_datasets")

In [1]:
# Class-id <-> class-name lookup tables for the three NLI classes.
id2label = dict(enumerate(("contradiction", "entailment", "neutral")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [2]:
from t5_enc.t5 import T5ForConditionalGenerationAndSequenceClassification

# Load the flan-t5-small checkpoint into the project's model class with a 3-way label
# mapping. The warning logged below reports `clf_head.weight` / `clf_head.bias` as newly
# initialised, i.e. they are not part of the checkpoint and must be trained.
model = T5ForConditionalGenerationAndSequenceClassification.from_pretrained("google/flan-t5-small", num_labels=3,
                                                                            id2label=id2label, label2id=label2id)

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer for the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

Some weights of T5ForConditionalGenerationAndSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['clf_head.bias', 'clf_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from datasets import load_from_disk
# Reload the tokenized splits that the preprocessing cells saved under "tokenized_datasets".
tokenized_datasets = load_from_disk("tokenized_datasets")

In [4]:
# Shrink 'train' and 'test' to a 10% subsample: only the held-out ('test') side of a
# seeded split with test_size=0.1 is kept. The splits are rebound in place, so running
# this cell again subsamples the already-reduced data again.
for split_name in ("train", "test"):
    subsample = tokenized_datasets[split_name].train_test_split(test_size=0.1, seed=0)
    tokenized_datasets[split_name] = subsample['test']

In [5]:
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 54937
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 983
    })
    valid: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9842
    })
})

In [6]:
from transformers import Seq2SeqTrainingArguments, TrainingArguments

training_args = TrainingArguments(
    # --- run setup ---
    output_dir="./results",
    seed=0,
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    # --- data handling ---
    remove_unused_columns=False,
    prediction_loss_only=False,
    # --- evaluation and checkpointing (both every 500 steps) ---
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # The eval set is registered under the key 'test' and compute_metrics reports
    # 'accuracy', hence the "test_accuracy" metric name.
    metric_for_best_model="test_accuracy",
    # --- logging ---
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=250,
)

In [7]:
from transformers import DataCollatorForSeq2Seq

# Seq2seq collator built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
p = None  # debug handle: (class scores, true class ids) from the latest compute_metrics call


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The class scores are read from
            ``predictions[2]`` and the true class ids from ``labels[0]``.
            NOTE(review): this layout depends on what the trainer/model emit - confirm.

    Returns:
        dict: ``{'accuracy': value}`` where value is the fraction of rows whose
        argmax over axis 1 equals the true class id.
    """
    global p
    predictions, labels = eval_pred
    pred = predictions[2]
    true = labels[0]
    # Publish the pair so the inspection cells that index p[0] / p[1] have data;
    # previously `p` was declared global here but never assigned, so it stayed None.
    p = (pred, true)
    pred_am = pred.argmax(1)
    acc = (pred_am == true).mean()
    return {'accuracy': acc}

In [9]:
# Keyword arguments for the trainer. The evaluation sets are given as a dict, with the
# single set registered under the name 'test'.
# NOTE(review): `alpha` is not a stock Trainer argument; presumably MyTrainer uses it
# to weight its losses - confirm in t5_enc.t5.
trainer_kwargs = dict(
    alpha=0.8,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=dict(test=tokenized_datasets["test"]),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [10]:
from transformers import T5Config, Seq2SeqTrainer, Trainer
# trainer = Trainer(**trainer_kwargs)

In [11]:
from t5_enc.t5 import MyTrainer

# Build the project's custom trainer from the kwargs assembled above (including `alpha`).
trainer = MyTrainer(**trainer_kwargs)

In [12]:
# data = data_collator([tokenized_datasets["train"][i] for i in range(1)])
# data = data.to('cuda')
# model.to('cuda')

In [13]:
# model(**data)

In [None]:
# Start training with the trainer configured above.
trainer.train()

In [None]:
from transformers.utils import is_sagemaker_mp_enabled
# Debug check: display the value of transformers' SageMaker model-parallel flag helper.
is_sagemaker_mp_enabled()

In [None]:
# `data` was only ever created in a commented-out cell, so on a fresh kernel this cell
# raised NameError. Build the one-example batch here the same way that cell did.
# NOTE(review): the commented-out cell also moved the batch to 'cuda'; that is omitted
# on the assumption that prediction_step places inputs on the model's device - confirm
# for MyTrainer.
data = data_collator([tokenized_datasets["train"][i] for i in range(1)])
# Third positional argument is prediction_loss_only in the stock Trainer signature.
loss, logits, labels = trainer.prediction_step(model, data, False)

In [None]:
# Display the logits returned by the prediction_step call above.
logits

In [None]:
# Run prediction on the first two training examples, passed as a plain list of rows.
trainer.predict([tokenized_datasets["train"][i] for i in range(2)])

In [None]:
# Inspect the trainer's label_names attribute.
trainer.label_names

In [None]:
# Display the labels returned by the prediction_step call above.
labels

In [None]:
# Shape of the second element of the debug global `p`.
p[1].shape

In [None]:
# Argmax along axis 1 of the first element of `p`.
p[0].argmax(1)

In [None]:
# Fraction of positions where p[1] equals the argmax of p[0]; this is the same
# formula compute_metrics uses for 'accuracy'.
(p[1] == p[0].argmax(1)).mean()

In [None]:
import evaluate

# Load the "accuracy" metric from the evaluate library.
accuracy = evaluate.load("accuracy")

In [None]:
0. проанализировать лосс
1. сначала генерация потом клф
1.5. по шагам сначала декодер, потом клф
2. чередование
3. clipping
4. 2-stage distillation (gpt-4 -> t5-large (gen) -> t5-small (gen, kl-div + clf))
5. 2-головые модели, как складывать лоссы