In [1]:
from data_utils import CQADatasetLoader, SVAMPDatasetLoader, ESNLIDatasetLoader, ANLI1DatasetLoader, ASDivDatasetLoader


In [2]:
dataset_loader = ESNLIDatasetLoader()

In [3]:
dataset = dataset_loader.load_from_json()

In [4]:
train_llm_rationales, train_llm_labels = dataset_loader.load_llm_preds(split='train')
test_llm_rationales, test_llm_labels = dataset_loader.load_llm_preds(split='test')
valid_llm_rationales, valid_llm_labels = dataset_loader.load_llm_preds(split='valid')

In [5]:
dataset['train'] = dataset['train'].add_column('llm_label', train_llm_labels)
dataset['test'] = dataset['test'].add_column('llm_label', test_llm_labels)
dataset['train'] = dataset['train'].add_column('llm_rationale', train_llm_rationales)
dataset['test'] = dataset['test'].add_column('llm_rationale', test_llm_rationales)
dataset['valid'] = dataset['valid'].add_column('llm_label', valid_llm_labels)
dataset['valid'] = dataset['valid'].add_column('llm_rationale', valid_llm_rationales)

In [6]:
id2label = {0: "contradiction", 1: "entailment", 2: "neutral"}
label2id = {v: k for k, v in id2label.items()}

In [1]:
from t5_enc.t5 import T5ForConditionalGenerationAndSequenceClassification

model = T5ForConditionalGenerationAndSequenceClassification.from_pretrained("google/flan-t5-small", num_labels=3,
                                                                            id2label=id2label, label2id=label2id)

NameError: name 'id2label' is not defined

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
dataset = dataset.map(
    lambda example: {'input': tokenizer.eos_token.join([example['premise'], example['hypothesis']])},
    remove_columns=['premise', 'hypothesis'],
)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'llm_label', 'llm_rationale', 'input'],
        num_rows: 549367
    })
    test: Dataset({
        features: ['label', 'llm_label', 'llm_rationale', 'input'],
        num_rows: 9824
    })
    valid: Dataset({
        features: ['label', 'llm_label', 'llm_rationale', 'input'],
        num_rows: 9842
    })
})

In [11]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input'], max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        rationale_output_encodings = tokenizer(examples['llm_rationale'], max_length=256, truncation=True)

    model_inputs['labels'] = rationale_output_encodings['input_ids']
    model_inputs['label'] = [label2id[e] for e in examples['label']]

    return model_inputs

In [12]:
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=['input', 'llm_label', 'llm_rationale', 'label'],
    batched=True
)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]



In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 549367
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9824
    })
    valid: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9842
    })
})

In [14]:
from transformers import Seq2SeqTrainingArguments, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    remove_unused_columns = False,
    evaluation_strategy = 'steps',
    eval_steps=15,
    save_steps=15,
    save_total_limit=3,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=15,
    num_train_epochs=3,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    # metric_for_best_model="test_accuracy",
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    seed=0,
    prediction_loss_only=False,
)

In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [16]:

p = None
def compute_metrics(eval_pred):
    global p
    predictions, labels = eval_pred
    print(predictions)
    p = predictions

In [17]:
trainer_kwargs = {
    'alpha': 0.5,
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_datasets["train"],
    'eval_dataset': {'test': tokenized_datasets["test"],},
    'data_collator': data_collator,
    'tokenizer': tokenizer,
    'compute_metrics': compute_metrics,
}

In [19]:
from t5_enc.t5 import MyTrainer

trainer = MyTrainer(**trainer_kwargs)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [25]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [21]:
trainer.predict([tokenized_datasets["train"][i] for i in range(10)])

(array(nan, dtype=float32), array([[[-43.344982  ,  -3.823163  ,  -9.568947  , ..., -43.30296   ,
         -43.30662   , -43.20291   ],
        [-36.69597   ,  -3.203759  ,  -6.599506  , ..., -36.667812  ,
         -36.73565   , -36.495316  ],
        [-41.270657  ,  -3.4887917 ,  -7.1263914 , ..., -41.15655   ,
         -41.26951   , -41.065056  ],
        ...,
        [-35.18772   ,  -2.294404  ,  -5.536607  , ..., -35.1947    ,
         -35.2855    , -35.026443  ],
        [-50.82319   ,  -2.5226564 ,  -9.210549  , ..., -50.89539   ,
         -50.91757   , -50.807384  ],
        [-56.56958   ,   1.0558381 , -10.8807125 , ..., -56.66071   ,
         -56.6109    , -56.714474  ]],

       [[-44.159786  ,  -3.5050118 ,  -8.90931   , ..., -44.143486  ,
         -44.124977  , -44.057106  ],
        [-37.438126  ,  -3.5394063 ,  -7.564574  , ..., -37.42482   ,
         -37.477608  , -37.265823  ],
        [-41.715214  ,  -3.9664586 ,  -7.501052  , ..., -41.614082  ,
         -41.708603  , 

TypeError: 'NoneType' object does not support item assignment

In [35]:
type(p)

numpy.ndarray

In [24]:
tokenized_datasets["train"][0]

{'label': 2,
 'input_ids': [71,
  568,
  30,
  3,
  9,
  4952,
  4418,
  7,
  147,
  3,
  9,
  4335,
  323,
  20527,
  5,
  1,
  71,
  568,
  19,
  761,
  112,
  4952,
  21,
  3,
  9,
  2259,
  5,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [37,
  568,
  228,
  36,
  761,
  112,
  4952,
  21,
  3,
  9,
  2259,
  6,
  68,
  34,
  19,
  59,
  6539,
  8,
  495,
  5,
  1]}

RuntimeError: Placeholder storage has not been allocated on MPS device!